diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 4 | ||||
-rw-r--r-- | mm/filemap.c | 183 | ||||
-rw-r--r-- | mm/filemap.h | 6 | ||||
-rw-r--r-- | mm/fremap.c | 9 | ||||
-rw-r--r-- | mm/hugetlb.c | 282 | ||||
-rw-r--r-- | mm/memory.c | 125 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 27 | ||||
-rw-r--r-- | mm/mempolicy.c | 36 | ||||
-rw-r--r-- | mm/migrate.c | 1058 | ||||
-rw-r--r-- | mm/mmap.c | 12 | ||||
-rw-r--r-- | mm/mprotect.c | 37 | ||||
-rw-r--r-- | mm/msync.c | 3 | ||||
-rw-r--r-- | mm/oom_kill.c | 9 | ||||
-rw-r--r-- | mm/page-writeback.c | 3 | ||||
-rw-r--r-- | mm/page_alloc.c | 184 | ||||
-rw-r--r-- | mm/pdflush.c | 3 | ||||
-rw-r--r-- | mm/rmap.c | 107 | ||||
-rw-r--r-- | mm/shmem.c | 18 | ||||
-rw-r--r-- | mm/slab.c | 249 | ||||
-rw-r--r-- | mm/sparse.c | 22 | ||||
-rw-r--r-- | mm/swap.c | 42 | ||||
-rw-r--r-- | mm/swapfile.c | 43 | ||||
-rw-r--r-- | mm/truncate.c | 22 | ||||
-rw-r--r-- | mm/vmalloc.c | 122 | ||||
-rw-r--r-- | mm/vmscan.c | 240 |
25 files changed, 1816 insertions, 1030 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 332f5c29b53a..66e65ab39426 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS | |||
138 | # | 138 | # |
139 | config MIGRATION | 139 | config MIGRATION |
140 | bool "Page migration" | 140 | bool "Page migration" |
141 | def_bool y if NUMA | 141 | def_bool y |
142 | depends on SWAP && NUMA | 142 | depends on NUMA |
143 | help | 143 | help |
144 | Allows the migration of the physical location of pages of processes | 144 | Allows the migration of the physical location of pages of processes |
145 | while the virtual addresses are not changed. This is useful for | 145 | while the virtual addresses are not changed. This is useful for |
diff --git a/mm/filemap.c b/mm/filemap.c index fd57442186cb..807a463fd5ed 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/compiler.h> | 15 | #include <linux/compiler.h> |
16 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
17 | #include <linux/uaccess.h> | ||
17 | #include <linux/aio.h> | 18 | #include <linux/aio.h> |
18 | #include <linux/capability.h> | 19 | #include <linux/capability.h> |
19 | #include <linux/kernel_stat.h> | 20 | #include <linux/kernel_stat.h> |
@@ -38,7 +39,6 @@ | |||
38 | */ | 39 | */ |
39 | #include <linux/buffer_head.h> /* for generic_osync_inode */ | 40 | #include <linux/buffer_head.h> /* for generic_osync_inode */ |
40 | 41 | ||
41 | #include <asm/uaccess.h> | ||
42 | #include <asm/mman.h> | 42 | #include <asm/mman.h> |
43 | 43 | ||
44 | static ssize_t | 44 | static ssize_t |
@@ -171,15 +171,17 @@ static int sync_page(void *word) | |||
171 | } | 171 | } |
172 | 172 | ||
173 | /** | 173 | /** |
174 | * filemap_fdatawrite_range - start writeback against all of a mapping's | 174 | * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range |
175 | * dirty pages that lie within the byte offsets <start, end> | ||
176 | * @mapping: address space structure to write | 175 | * @mapping: address space structure to write |
177 | * @start: offset in bytes where the range starts | 176 | * @start: offset in bytes where the range starts |
178 | * @end: offset in bytes where the range ends (inclusive) | 177 | * @end: offset in bytes where the range ends (inclusive) |
179 | * @sync_mode: enable synchronous operation | 178 | * @sync_mode: enable synchronous operation |
180 | * | 179 | * |
180 | * Start writeback against all of a mapping's dirty pages that lie | ||
181 | * within the byte offsets <start, end> inclusive. | ||
182 | * | ||
181 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as | 183 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as |
182 | * opposed to a regular memory * cleansing writeback. The difference between | 184 | * opposed to a regular memory cleansing writeback. The difference between |
183 | * these two operations is that if a dirty page/buffer is encountered, it must | 185 | * these two operations is that if a dirty page/buffer is encountered, it must |
184 | * be waited upon, and not just skipped over. | 186 | * be waited upon, and not just skipped over. |
185 | */ | 187 | */ |
@@ -190,8 +192,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
190 | struct writeback_control wbc = { | 192 | struct writeback_control wbc = { |
191 | .sync_mode = sync_mode, | 193 | .sync_mode = sync_mode, |
192 | .nr_to_write = mapping->nrpages * 2, | 194 | .nr_to_write = mapping->nrpages * 2, |
193 | .start = start, | 195 | .range_start = start, |
194 | .end = end, | 196 | .range_end = end, |
195 | }; | 197 | }; |
196 | 198 | ||
197 | if (!mapping_cap_writeback_dirty(mapping)) | 199 | if (!mapping_cap_writeback_dirty(mapping)) |
@@ -204,7 +206,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
204 | static inline int __filemap_fdatawrite(struct address_space *mapping, | 206 | static inline int __filemap_fdatawrite(struct address_space *mapping, |
205 | int sync_mode) | 207 | int sync_mode) |
206 | { | 208 | { |
207 | return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode); | 209 | return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); |
208 | } | 210 | } |
209 | 211 | ||
210 | int filemap_fdatawrite(struct address_space *mapping) | 212 | int filemap_fdatawrite(struct address_space *mapping) |
@@ -219,7 +221,10 @@ static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
219 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); | 221 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); |
220 | } | 222 | } |
221 | 223 | ||
222 | /* | 224 | /** |
225 | * filemap_flush - mostly a non-blocking flush | ||
226 | * @mapping: target address_space | ||
227 | * | ||
223 | * This is a mostly non-blocking flush. Not suitable for data-integrity | 228 | * This is a mostly non-blocking flush. Not suitable for data-integrity |
224 | * purposes - I/O may not be started against all dirty pages. | 229 | * purposes - I/O may not be started against all dirty pages. |
225 | */ | 230 | */ |
@@ -229,7 +234,12 @@ int filemap_flush(struct address_space *mapping) | |||
229 | } | 234 | } |
230 | EXPORT_SYMBOL(filemap_flush); | 235 | EXPORT_SYMBOL(filemap_flush); |
231 | 236 | ||
232 | /* | 237 | /** |
238 | * wait_on_page_writeback_range - wait for writeback to complete | ||
239 | * @mapping: target address_space | ||
240 | * @start: beginning page index | ||
241 | * @end: ending page index | ||
242 | * | ||
233 | * Wait for writeback to complete against pages indexed by start->end | 243 | * Wait for writeback to complete against pages indexed by start->end |
234 | * inclusive | 244 | * inclusive |
235 | */ | 245 | */ |
@@ -276,7 +286,13 @@ int wait_on_page_writeback_range(struct address_space *mapping, | |||
276 | return ret; | 286 | return ret; |
277 | } | 287 | } |
278 | 288 | ||
279 | /* | 289 | /** |
290 | * sync_page_range - write and wait on all pages in the passed range | ||
291 | * @inode: target inode | ||
292 | * @mapping: target address_space | ||
293 | * @pos: beginning offset in pages to write | ||
294 | * @count: number of bytes to write | ||
295 | * | ||
280 | * Write and wait upon all the pages in the passed range. This is a "data | 296 | * Write and wait upon all the pages in the passed range. This is a "data |
281 | * integrity" operation. It waits upon in-flight writeout before starting and | 297 | * integrity" operation. It waits upon in-flight writeout before starting and |
282 | * waiting upon new writeout. If there was an IO error, return it. | 298 | * waiting upon new writeout. If there was an IO error, return it. |
@@ -305,7 +321,13 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, | |||
305 | } | 321 | } |
306 | EXPORT_SYMBOL(sync_page_range); | 322 | EXPORT_SYMBOL(sync_page_range); |
307 | 323 | ||
308 | /* | 324 | /** |
325 | * sync_page_range_nolock | ||
326 | * @inode: target inode | ||
327 | * @mapping: target address_space | ||
328 | * @pos: beginning offset in pages to write | ||
329 | * @count: number of bytes to write | ||
330 | * | ||
309 | * Note: Holding i_mutex across sync_page_range_nolock is not a good idea | 331 | * Note: Holding i_mutex across sync_page_range_nolock is not a good idea |
310 | * as it forces O_SYNC writers to different parts of the same file | 332 | * as it forces O_SYNC writers to different parts of the same file |
311 | * to be serialised right until io completion. | 333 | * to be serialised right until io completion. |
@@ -329,10 +351,11 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, | |||
329 | EXPORT_SYMBOL(sync_page_range_nolock); | 351 | EXPORT_SYMBOL(sync_page_range_nolock); |
330 | 352 | ||
331 | /** | 353 | /** |
332 | * filemap_fdatawait - walk the list of under-writeback pages of the given | 354 | * filemap_fdatawait - wait for all under-writeback pages to complete |
333 | * address space and wait for all of them. | ||
334 | * | ||
335 | * @mapping: address space structure to wait for | 355 | * @mapping: address space structure to wait for |
356 | * | ||
357 | * Walk the list of under-writeback pages of the given address space | ||
358 | * and wait for all of them. | ||
336 | */ | 359 | */ |
337 | int filemap_fdatawait(struct address_space *mapping) | 360 | int filemap_fdatawait(struct address_space *mapping) |
338 | { | 361 | { |
@@ -368,7 +391,12 @@ int filemap_write_and_wait(struct address_space *mapping) | |||
368 | } | 391 | } |
369 | EXPORT_SYMBOL(filemap_write_and_wait); | 392 | EXPORT_SYMBOL(filemap_write_and_wait); |
370 | 393 | ||
371 | /* | 394 | /** |
395 | * filemap_write_and_wait_range - write out & wait on a file range | ||
396 | * @mapping: the address_space for the pages | ||
397 | * @lstart: offset in bytes where the range starts | ||
398 | * @lend: offset in bytes where the range ends (inclusive) | ||
399 | * | ||
372 | * Write out and wait upon file offsets lstart->lend, inclusive. | 400 | * Write out and wait upon file offsets lstart->lend, inclusive. |
373 | * | 401 | * |
374 | * Note that `lend' is inclusive (describes the last byte to be written) so | 402 | * Note that `lend' is inclusive (describes the last byte to be written) so |
@@ -394,8 +422,14 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
394 | return err; | 422 | return err; |
395 | } | 423 | } |
396 | 424 | ||
397 | /* | 425 | /** |
398 | * This function is used to add newly allocated pagecache pages: | 426 | * add_to_page_cache - add newly allocated pagecache pages |
427 | * @page: page to add | ||
428 | * @mapping: the page's address_space | ||
429 | * @offset: page index | ||
430 | * @gfp_mask: page allocation mode | ||
431 | * | ||
432 | * This function is used to add newly allocated pagecache pages; | ||
399 | * the page is new, so we can just run SetPageLocked() against it. | 433 | * the page is new, so we can just run SetPageLocked() against it. |
400 | * The other page state flags were set by rmqueue(). | 434 | * The other page state flags were set by rmqueue(). |
401 | * | 435 | * |
@@ -422,7 +456,6 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, | |||
422 | } | 456 | } |
423 | return error; | 457 | return error; |
424 | } | 458 | } |
425 | |||
426 | EXPORT_SYMBOL(add_to_page_cache); | 459 | EXPORT_SYMBOL(add_to_page_cache); |
427 | 460 | ||
428 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | 461 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, |
@@ -489,8 +522,7 @@ void fastcall wait_on_page_bit(struct page *page, int bit_nr) | |||
489 | EXPORT_SYMBOL(wait_on_page_bit); | 522 | EXPORT_SYMBOL(wait_on_page_bit); |
490 | 523 | ||
491 | /** | 524 | /** |
492 | * unlock_page() - unlock a locked page | 525 | * unlock_page - unlock a locked page |
493 | * | ||
494 | * @page: the page | 526 | * @page: the page |
495 | * | 527 | * |
496 | * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). | 528 | * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). |
@@ -513,8 +545,9 @@ void fastcall unlock_page(struct page *page) | |||
513 | } | 545 | } |
514 | EXPORT_SYMBOL(unlock_page); | 546 | EXPORT_SYMBOL(unlock_page); |
515 | 547 | ||
516 | /* | 548 | /** |
517 | * End writeback against a page. | 549 | * end_page_writeback - end writeback against a page |
550 | * @page: the page | ||
518 | */ | 551 | */ |
519 | void end_page_writeback(struct page *page) | 552 | void end_page_writeback(struct page *page) |
520 | { | 553 | { |
@@ -527,10 +560,11 @@ void end_page_writeback(struct page *page) | |||
527 | } | 560 | } |
528 | EXPORT_SYMBOL(end_page_writeback); | 561 | EXPORT_SYMBOL(end_page_writeback); |
529 | 562 | ||
530 | /* | 563 | /** |
531 | * Get a lock on the page, assuming we need to sleep to get it. | 564 | * __lock_page - get a lock on the page, assuming we need to sleep to get it |
565 | * @page: the page to lock | ||
532 | * | 566 | * |
533 | * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some | 567 | * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some |
534 | * random driver's requestfn sets TASK_RUNNING, we could busywait. However | 568 | * random driver's requestfn sets TASK_RUNNING, we could busywait. However |
535 | * chances are that on the second loop, the block layer's plug list is empty, | 569 | * chances are that on the second loop, the block layer's plug list is empty, |
536 | * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. | 570 | * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. |
@@ -544,8 +578,12 @@ void fastcall __lock_page(struct page *page) | |||
544 | } | 578 | } |
545 | EXPORT_SYMBOL(__lock_page); | 579 | EXPORT_SYMBOL(__lock_page); |
546 | 580 | ||
547 | /* | 581 | /** |
548 | * a rather lightweight function, finding and getting a reference to a | 582 | * find_get_page - find and get a page reference |
583 | * @mapping: the address_space to search | ||
584 | * @offset: the page index | ||
585 | * | ||
586 | * A rather lightweight function, finding and getting a reference to a | ||
549 | * hashed page atomically. | 587 | * hashed page atomically. |
550 | */ | 588 | */ |
551 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) | 589 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) |
@@ -559,11 +597,14 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset) | |||
559 | read_unlock_irq(&mapping->tree_lock); | 597 | read_unlock_irq(&mapping->tree_lock); |
560 | return page; | 598 | return page; |
561 | } | 599 | } |
562 | |||
563 | EXPORT_SYMBOL(find_get_page); | 600 | EXPORT_SYMBOL(find_get_page); |
564 | 601 | ||
565 | /* | 602 | /** |
566 | * Same as above, but trylock it instead of incrementing the count. | 603 | * find_trylock_page - find and lock a page |
604 | * @mapping: the address_space to search | ||
605 | * @offset: the page index | ||
606 | * | ||
607 | * Same as find_get_page(), but trylock it instead of incrementing the count. | ||
567 | */ | 608 | */ |
568 | struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) | 609 | struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) |
569 | { | 610 | { |
@@ -576,12 +617,10 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs | |||
576 | read_unlock_irq(&mapping->tree_lock); | 617 | read_unlock_irq(&mapping->tree_lock); |
577 | return page; | 618 | return page; |
578 | } | 619 | } |
579 | |||
580 | EXPORT_SYMBOL(find_trylock_page); | 620 | EXPORT_SYMBOL(find_trylock_page); |
581 | 621 | ||
582 | /** | 622 | /** |
583 | * find_lock_page - locate, pin and lock a pagecache page | 623 | * find_lock_page - locate, pin and lock a pagecache page |
584 | * | ||
585 | * @mapping: the address_space to search | 624 | * @mapping: the address_space to search |
586 | * @offset: the page index | 625 | * @offset: the page index |
587 | * | 626 | * |
@@ -617,12 +656,10 @@ repeat: | |||
617 | read_unlock_irq(&mapping->tree_lock); | 656 | read_unlock_irq(&mapping->tree_lock); |
618 | return page; | 657 | return page; |
619 | } | 658 | } |
620 | |||
621 | EXPORT_SYMBOL(find_lock_page); | 659 | EXPORT_SYMBOL(find_lock_page); |
622 | 660 | ||
623 | /** | 661 | /** |
624 | * find_or_create_page - locate or add a pagecache page | 662 | * find_or_create_page - locate or add a pagecache page |
625 | * | ||
626 | * @mapping: the page's address_space | 663 | * @mapping: the page's address_space |
627 | * @index: the page's index into the mapping | 664 | * @index: the page's index into the mapping |
628 | * @gfp_mask: page allocation mode | 665 | * @gfp_mask: page allocation mode |
@@ -663,7 +700,6 @@ repeat: | |||
663 | page_cache_release(cached_page); | 700 | page_cache_release(cached_page); |
664 | return page; | 701 | return page; |
665 | } | 702 | } |
666 | |||
667 | EXPORT_SYMBOL(find_or_create_page); | 703 | EXPORT_SYMBOL(find_or_create_page); |
668 | 704 | ||
669 | /** | 705 | /** |
@@ -729,9 +765,16 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
729 | return i; | 765 | return i; |
730 | } | 766 | } |
731 | 767 | ||
732 | /* | 768 | /** |
769 | * find_get_pages_tag - find and return pages that match @tag | ||
770 | * @mapping: the address_space to search | ||
771 | * @index: the starting page index | ||
772 | * @tag: the tag index | ||
773 | * @nr_pages: the maximum number of pages | ||
774 | * @pages: where the resulting pages are placed | ||
775 | * | ||
733 | * Like find_get_pages, except we only return pages which are tagged with | 776 | * Like find_get_pages, except we only return pages which are tagged with |
734 | * `tag'. We update *index to index the next page for the traversal. | 777 | * @tag. We update @index to index the next page for the traversal. |
735 | */ | 778 | */ |
736 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | 779 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, |
737 | int tag, unsigned int nr_pages, struct page **pages) | 780 | int tag, unsigned int nr_pages, struct page **pages) |
@@ -750,7 +793,11 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | |||
750 | return ret; | 793 | return ret; |
751 | } | 794 | } |
752 | 795 | ||
753 | /* | 796 | /** |
797 | * grab_cache_page_nowait - returns locked page at given index in given cache | ||
798 | * @mapping: target address_space | ||
799 | * @index: the page index | ||
800 | * | ||
754 | * Same as grab_cache_page, but do not wait if the page is unavailable. | 801 | * Same as grab_cache_page, but do not wait if the page is unavailable. |
755 | * This is intended for speculative data generators, where the data can | 802 | * This is intended for speculative data generators, where the data can |
756 | * be regenerated if the page couldn't be grabbed. This routine should | 803 | * be regenerated if the page couldn't be grabbed. This routine should |
@@ -779,19 +826,25 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) | |||
779 | } | 826 | } |
780 | return page; | 827 | return page; |
781 | } | 828 | } |
782 | |||
783 | EXPORT_SYMBOL(grab_cache_page_nowait); | 829 | EXPORT_SYMBOL(grab_cache_page_nowait); |
784 | 830 | ||
785 | /* | 831 | /** |
832 | * do_generic_mapping_read - generic file read routine | ||
833 | * @mapping: address_space to be read | ||
834 | * @_ra: file's readahead state | ||
835 | * @filp: the file to read | ||
836 | * @ppos: current file position | ||
837 | * @desc: read_descriptor | ||
838 | * @actor: read method | ||
839 | * | ||
786 | * This is a generic file read routine, and uses the | 840 | * This is a generic file read routine, and uses the |
787 | * mapping->a_ops->readpage() function for the actual low-level | 841 | * mapping->a_ops->readpage() function for the actual low-level stuff. |
788 | * stuff. | ||
789 | * | 842 | * |
790 | * This is really ugly. But the goto's actually try to clarify some | 843 | * This is really ugly. But the goto's actually try to clarify some |
791 | * of the logic when it comes to error handling etc. | 844 | * of the logic when it comes to error handling etc. |
792 | * | 845 | * |
793 | * Note the struct file* is only passed for the use of readpage. It may be | 846 | * Note the struct file* is only passed for the use of readpage. |
794 | * NULL. | 847 | * It may be NULL. |
795 | */ | 848 | */ |
796 | void do_generic_mapping_read(struct address_space *mapping, | 849 | void do_generic_mapping_read(struct address_space *mapping, |
797 | struct file_ra_state *_ra, | 850 | struct file_ra_state *_ra, |
@@ -1004,7 +1057,6 @@ out: | |||
1004 | if (filp) | 1057 | if (filp) |
1005 | file_accessed(filp); | 1058 | file_accessed(filp); |
1006 | } | 1059 | } |
1007 | |||
1008 | EXPORT_SYMBOL(do_generic_mapping_read); | 1060 | EXPORT_SYMBOL(do_generic_mapping_read); |
1009 | 1061 | ||
1010 | int file_read_actor(read_descriptor_t *desc, struct page *page, | 1062 | int file_read_actor(read_descriptor_t *desc, struct page *page, |
@@ -1045,7 +1097,13 @@ success: | |||
1045 | return size; | 1097 | return size; |
1046 | } | 1098 | } |
1047 | 1099 | ||
1048 | /* | 1100 | /** |
1101 | * __generic_file_aio_read - generic filesystem read routine | ||
1102 | * @iocb: kernel I/O control block | ||
1103 | * @iov: io vector request | ||
1104 | * @nr_segs: number of segments in the iovec | ||
1105 | * @ppos: current file position | ||
1106 | * | ||
1049 | * This is the "read()" routine for all filesystems | 1107 | * This is the "read()" routine for all filesystems |
1050 | * that can use the page cache directly. | 1108 | * that can use the page cache directly. |
1051 | */ | 1109 | */ |
@@ -1124,7 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1124 | out: | 1182 | out: |
1125 | return retval; | 1183 | return retval; |
1126 | } | 1184 | } |
1127 | |||
1128 | EXPORT_SYMBOL(__generic_file_aio_read); | 1185 | EXPORT_SYMBOL(__generic_file_aio_read); |
1129 | 1186 | ||
1130 | ssize_t | 1187 | ssize_t |
@@ -1135,7 +1192,6 @@ generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t | |||
1135 | BUG_ON(iocb->ki_pos != pos); | 1192 | BUG_ON(iocb->ki_pos != pos); |
1136 | return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); | 1193 | return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); |
1137 | } | 1194 | } |
1138 | |||
1139 | EXPORT_SYMBOL(generic_file_aio_read); | 1195 | EXPORT_SYMBOL(generic_file_aio_read); |
1140 | 1196 | ||
1141 | ssize_t | 1197 | ssize_t |
@@ -1151,7 +1207,6 @@ generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppo | |||
1151 | ret = wait_on_sync_kiocb(&kiocb); | 1207 | ret = wait_on_sync_kiocb(&kiocb); |
1152 | return ret; | 1208 | return ret; |
1153 | } | 1209 | } |
1154 | |||
1155 | EXPORT_SYMBOL(generic_file_read); | 1210 | EXPORT_SYMBOL(generic_file_read); |
1156 | 1211 | ||
1157 | int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) | 1212 | int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) |
@@ -1192,7 +1247,6 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, | |||
1192 | return desc.written; | 1247 | return desc.written; |
1193 | return desc.error; | 1248 | return desc.error; |
1194 | } | 1249 | } |
1195 | |||
1196 | EXPORT_SYMBOL(generic_file_sendfile); | 1250 | EXPORT_SYMBOL(generic_file_sendfile); |
1197 | 1251 | ||
1198 | static ssize_t | 1252 | static ssize_t |
@@ -1228,11 +1282,15 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
1228 | } | 1282 | } |
1229 | 1283 | ||
1230 | #ifdef CONFIG_MMU | 1284 | #ifdef CONFIG_MMU |
1231 | /* | 1285 | static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); |
1286 | /** | ||
1287 | * page_cache_read - adds requested page to the page cache if not already there | ||
1288 | * @file: file to read | ||
1289 | * @offset: page index | ||
1290 | * | ||
1232 | * This adds the requested page to the page cache if it isn't already there, | 1291 | * This adds the requested page to the page cache if it isn't already there, |
1233 | * and schedules an I/O to read in its contents from disk. | 1292 | * and schedules an I/O to read in its contents from disk. |
1234 | */ | 1293 | */ |
1235 | static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | ||
1236 | static int fastcall page_cache_read(struct file * file, unsigned long offset) | 1294 | static int fastcall page_cache_read(struct file * file, unsigned long offset) |
1237 | { | 1295 | { |
1238 | struct address_space *mapping = file->f_mapping; | 1296 | struct address_space *mapping = file->f_mapping; |
@@ -1259,7 +1317,12 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) | |||
1259 | 1317 | ||
1260 | #define MMAP_LOTSAMISS (100) | 1318 | #define MMAP_LOTSAMISS (100) |
1261 | 1319 | ||
1262 | /* | 1320 | /** |
1321 | * filemap_nopage - read in file data for page fault handling | ||
1322 | * @area: the applicable vm_area | ||
1323 | * @address: target address to read in | ||
1324 | * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL | ||
1325 | * | ||
1263 | * filemap_nopage() is invoked via the vma operations vector for a | 1326 | * filemap_nopage() is invoked via the vma operations vector for a |
1264 | * mapped memory region to read in file data during a page fault. | 1327 | * mapped memory region to read in file data during a page fault. |
1265 | * | 1328 | * |
@@ -1462,7 +1525,6 @@ page_not_uptodate: | |||
1462 | page_cache_release(page); | 1525 | page_cache_release(page); |
1463 | return NULL; | 1526 | return NULL; |
1464 | } | 1527 | } |
1465 | |||
1466 | EXPORT_SYMBOL(filemap_nopage); | 1528 | EXPORT_SYMBOL(filemap_nopage); |
1467 | 1529 | ||
1468 | static struct page * filemap_getpage(struct file *file, unsigned long pgoff, | 1530 | static struct page * filemap_getpage(struct file *file, unsigned long pgoff, |
@@ -1716,7 +1778,13 @@ repeat: | |||
1716 | return page; | 1778 | return page; |
1717 | } | 1779 | } |
1718 | 1780 | ||
1719 | /* | 1781 | /** |
1782 | * read_cache_page - read into page cache, fill it if needed | ||
1783 | * @mapping: the page's address_space | ||
1784 | * @index: the page index | ||
1785 | * @filler: function to perform the read | ||
1786 | * @data: destination for read data | ||
1787 | * | ||
1720 | * Read into the page cache. If a page already exists, | 1788 | * Read into the page cache. If a page already exists, |
1721 | * and PageUptodate() is not set, try to fill the page. | 1789 | * and PageUptodate() is not set, try to fill the page. |
1722 | */ | 1790 | */ |
@@ -1754,7 +1822,6 @@ retry: | |||
1754 | out: | 1822 | out: |
1755 | return page; | 1823 | return page; |
1756 | } | 1824 | } |
1757 | |||
1758 | EXPORT_SYMBOL(read_cache_page); | 1825 | EXPORT_SYMBOL(read_cache_page); |
1759 | 1826 | ||
1760 | /* | 1827 | /* |
@@ -1835,7 +1902,7 @@ __filemap_copy_from_user_iovec(char *vaddr, | |||
1835 | int copy = min(bytes, iov->iov_len - base); | 1902 | int copy = min(bytes, iov->iov_len - base); |
1836 | 1903 | ||
1837 | base = 0; | 1904 | base = 0; |
1838 | left = __copy_from_user_inatomic(vaddr, buf, copy); | 1905 | left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); |
1839 | copied += copy; | 1906 | copied += copy; |
1840 | bytes -= copy; | 1907 | bytes -= copy; |
1841 | vaddr += copy; | 1908 | vaddr += copy; |
@@ -1854,7 +1921,7 @@ __filemap_copy_from_user_iovec(char *vaddr, | |||
1854 | /* | 1921 | /* |
1855 | * Performs necessary checks before doing a write | 1922 | * Performs necessary checks before doing a write |
1856 | * | 1923 | * |
1857 | * Can adjust writing position aor amount of bytes to write. | 1924 | * Can adjust writing position or amount of bytes to write. |
1858 | * Returns appropriate error code that caller should return or | 1925 | * Returns appropriate error code that caller should return or |
1859 | * zero in case that write should be allowed. | 1926 | * zero in case that write should be allowed. |
1860 | */ | 1927 | */ |
diff --git a/mm/filemap.h b/mm/filemap.h index 13793ba0ce17..5683cde22055 100644 --- a/mm/filemap.h +++ b/mm/filemap.h | |||
@@ -13,7 +13,7 @@ | |||
13 | #include <linux/highmem.h> | 13 | #include <linux/highmem.h> |
14 | #include <linux/uio.h> | 14 | #include <linux/uio.h> |
15 | #include <linux/config.h> | 15 | #include <linux/config.h> |
16 | #include <asm/uaccess.h> | 16 | #include <linux/uaccess.h> |
17 | 17 | ||
18 | size_t | 18 | size_t |
19 | __filemap_copy_from_user_iovec(char *vaddr, | 19 | __filemap_copy_from_user_iovec(char *vaddr, |
@@ -34,13 +34,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset, | |||
34 | int left; | 34 | int left; |
35 | 35 | ||
36 | kaddr = kmap_atomic(page, KM_USER0); | 36 | kaddr = kmap_atomic(page, KM_USER0); |
37 | left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); | 37 | left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); |
38 | kunmap_atomic(kaddr, KM_USER0); | 38 | kunmap_atomic(kaddr, KM_USER0); |
39 | 39 | ||
40 | if (left != 0) { | 40 | if (left != 0) { |
41 | /* Do it the slow way */ | 41 | /* Do it the slow way */ |
42 | kaddr = kmap(page); | 42 | kaddr = kmap(page); |
43 | left = __copy_from_user(kaddr + offset, buf, bytes); | 43 | left = __copy_from_user_nocache(kaddr + offset, buf, bytes); |
44 | kunmap(page); | 44 | kunmap(page); |
45 | } | 45 | } |
46 | return bytes - left; | 46 | return bytes - left; |
diff --git a/mm/fremap.c b/mm/fremap.c index 9f381e58bf44..21b7d0cbc98c 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -83,6 +83,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
83 | page_add_file_rmap(page); | 83 | page_add_file_rmap(page); |
84 | pte_val = *pte; | 84 | pte_val = *pte; |
85 | update_mmu_cache(vma, addr, pte_val); | 85 | update_mmu_cache(vma, addr, pte_val); |
86 | lazy_mmu_prot_update(pte_val); | ||
86 | err = 0; | 87 | err = 0; |
87 | unlock: | 88 | unlock: |
88 | pte_unmap_unlock(pte, ptl); | 89 | pte_unmap_unlock(pte, ptl); |
@@ -114,7 +115,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
114 | 115 | ||
115 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); | 116 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); |
116 | pte_val = *pte; | 117 | pte_val = *pte; |
117 | update_mmu_cache(vma, addr, pte_val); | 118 | /* |
119 | * We don't need to run update_mmu_cache() here because the "file pte" | ||
120 | * being installed by install_file_pte() is not a real pte - it's a | ||
121 | * non-present entry (like a swap entry), noting what file offset should | ||
122 | * be mapped there when there's a fault (in a non-linear vma where | ||
123 | * that's not obvious). | ||
124 | */ | ||
118 | pte_unmap_unlock(pte, ptl); | 125 | pte_unmap_unlock(pte, ptl); |
119 | err = 0; | 126 | err = 0; |
120 | out: | 127 | out: |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 832f676ca038..df499973255f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -22,7 +22,7 @@ | |||
22 | #include "internal.h" | 22 | #include "internal.h" |
23 | 23 | ||
24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
25 | static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; |
26 | unsigned long max_huge_pages; | 26 | unsigned long max_huge_pages; |
27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void) | |||
123 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 123 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
124 | unsigned long addr) | 124 | unsigned long addr) |
125 | { | 125 | { |
126 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | ||
127 | struct page *page; | 126 | struct page *page; |
128 | int use_reserve = 0; | ||
129 | unsigned long idx; | ||
130 | 127 | ||
131 | spin_lock(&hugetlb_lock); | 128 | spin_lock(&hugetlb_lock); |
132 | 129 | if (vma->vm_flags & VM_MAYSHARE) | |
133 | if (vma->vm_flags & VM_MAYSHARE) { | 130 | resv_huge_pages--; |
134 | 131 | else if (free_huge_pages <= resv_huge_pages) | |
135 | /* idx = radix tree index, i.e. offset into file in | 132 | goto fail; |
136 | * HPAGE_SIZE units */ | ||
137 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
138 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
139 | |||
140 | /* The hugetlbfs specific inode info stores the number | ||
141 | * of "guaranteed available" (huge) pages. That is, | ||
142 | * the first 'prereserved_hpages' pages of the inode | ||
143 | * are either already instantiated, or have been | ||
144 | * pre-reserved (by hugetlb_reserve_for_inode()). Here | ||
145 | * we're in the process of instantiating the page, so | ||
146 | * we use this to determine whether to draw from the | ||
147 | * pre-reserved pool or the truly free pool. */ | ||
148 | if (idx < HUGETLBFS_I(inode)->prereserved_hpages) | ||
149 | use_reserve = 1; | ||
150 | } | ||
151 | |||
152 | if (!use_reserve) { | ||
153 | if (free_huge_pages <= reserved_huge_pages) | ||
154 | goto fail; | ||
155 | } else { | ||
156 | BUG_ON(reserved_huge_pages == 0); | ||
157 | reserved_huge_pages--; | ||
158 | } | ||
159 | 133 | ||
160 | page = dequeue_huge_page(vma, addr); | 134 | page = dequeue_huge_page(vma, addr); |
161 | if (!page) | 135 | if (!page) |
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
165 | set_page_refcounted(page); | 139 | set_page_refcounted(page); |
166 | return page; | 140 | return page; |
167 | 141 | ||
168 | fail: | 142 | fail: |
169 | WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ | ||
170 | spin_unlock(&hugetlb_lock); | 143 | spin_unlock(&hugetlb_lock); |
171 | return NULL; | 144 | return NULL; |
172 | } | 145 | } |
173 | 146 | ||
174 | /* hugetlb_extend_reservation() | ||
175 | * | ||
176 | * Ensure that at least 'atleast' hugepages are, and will remain, | ||
177 | * available to instantiate the first 'atleast' pages of the given | ||
178 | * inode. If the inode doesn't already have this many pages reserved | ||
179 | * or instantiated, set aside some hugepages in the reserved pool to | ||
180 | * satisfy later faults (or fail now if there aren't enough, rather | ||
181 | * than getting the SIGBUS later). | ||
182 | */ | ||
183 | int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, | ||
184 | unsigned long atleast) | ||
185 | { | ||
186 | struct inode *inode = &info->vfs_inode; | ||
187 | unsigned long change_in_reserve = 0; | ||
188 | int ret = 0; | ||
189 | |||
190 | spin_lock(&hugetlb_lock); | ||
191 | read_lock_irq(&inode->i_mapping->tree_lock); | ||
192 | |||
193 | if (info->prereserved_hpages >= atleast) | ||
194 | goto out; | ||
195 | |||
196 | /* Because we always call this on shared mappings, none of the | ||
197 | * pages beyond info->prereserved_hpages can have been | ||
198 | * instantiated, so we need to reserve all of them now. */ | ||
199 | change_in_reserve = atleast - info->prereserved_hpages; | ||
200 | |||
201 | if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { | ||
202 | ret = -ENOMEM; | ||
203 | goto out; | ||
204 | } | ||
205 | |||
206 | reserved_huge_pages += change_in_reserve; | ||
207 | info->prereserved_hpages = atleast; | ||
208 | |||
209 | out: | ||
210 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
211 | spin_unlock(&hugetlb_lock); | ||
212 | |||
213 | return ret; | ||
214 | } | ||
215 | |||
216 | /* hugetlb_truncate_reservation() | ||
217 | * | ||
218 | * This returns pages reserved for the given inode to the general free | ||
219 | * hugepage pool. If the inode has any pages prereserved, but not | ||
220 | * instantiated, beyond offset (atmost << HPAGE_SIZE), then release | ||
221 | * them. | ||
222 | */ | ||
223 | void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, | ||
224 | unsigned long atmost) | ||
225 | { | ||
226 | struct inode *inode = &info->vfs_inode; | ||
227 | struct address_space *mapping = inode->i_mapping; | ||
228 | unsigned long idx; | ||
229 | unsigned long change_in_reserve = 0; | ||
230 | struct page *page; | ||
231 | |||
232 | spin_lock(&hugetlb_lock); | ||
233 | read_lock_irq(&inode->i_mapping->tree_lock); | ||
234 | |||
235 | if (info->prereserved_hpages <= atmost) | ||
236 | goto out; | ||
237 | |||
238 | /* Count pages which were reserved, but not instantiated, and | ||
239 | * which we can now release. */ | ||
240 | for (idx = atmost; idx < info->prereserved_hpages; idx++) { | ||
241 | page = radix_tree_lookup(&mapping->page_tree, idx); | ||
242 | if (!page) | ||
243 | /* Pages which are already instantiated can't | ||
244 | * be unreserved (and in fact have already | ||
245 | * been removed from the reserved pool) */ | ||
246 | change_in_reserve++; | ||
247 | } | ||
248 | |||
249 | BUG_ON(reserved_huge_pages < change_in_reserve); | ||
250 | reserved_huge_pages -= change_in_reserve; | ||
251 | info->prereserved_hpages = atmost; | ||
252 | |||
253 | out: | ||
254 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
255 | spin_unlock(&hugetlb_lock); | ||
256 | } | ||
257 | |||
258 | static int __init hugetlb_init(void) | 147 | static int __init hugetlb_init(void) |
259 | { | 148 | { |
260 | unsigned long i; | 149 | unsigned long i; |
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
334 | return nr_huge_pages; | 223 | return nr_huge_pages; |
335 | 224 | ||
336 | spin_lock(&hugetlb_lock); | 225 | spin_lock(&hugetlb_lock); |
337 | count = max(count, reserved_huge_pages); | 226 | count = max(count, resv_huge_pages); |
338 | try_to_free_low(count); | 227 | try_to_free_low(count); |
339 | while (count < nr_huge_pages) { | 228 | while (count < nr_huge_pages) { |
340 | struct page *page = dequeue_huge_page(NULL, 0); | 229 | struct page *page = dequeue_huge_page(NULL, 0); |
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf) | |||
361 | return sprintf(buf, | 250 | return sprintf(buf, |
362 | "HugePages_Total: %5lu\n" | 251 | "HugePages_Total: %5lu\n" |
363 | "HugePages_Free: %5lu\n" | 252 | "HugePages_Free: %5lu\n" |
364 | "HugePages_Rsvd: %5lu\n" | 253 | "HugePages_Rsvd: %5lu\n" |
365 | "Hugepagesize: %5lu kB\n", | 254 | "Hugepagesize: %5lu kB\n", |
366 | nr_huge_pages, | 255 | nr_huge_pages, |
367 | free_huge_pages, | 256 | free_huge_pages, |
368 | reserved_huge_pages, | 257 | resv_huge_pages, |
369 | HPAGE_SIZE/1024); | 258 | HPAGE_SIZE/1024); |
370 | } | 259 | } |
371 | 260 | ||
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
754 | flush_tlb_range(vma, start, end); | 643 | flush_tlb_range(vma, start, end); |
755 | } | 644 | } |
756 | 645 | ||
646 | struct file_region { | ||
647 | struct list_head link; | ||
648 | long from; | ||
649 | long to; | ||
650 | }; | ||
651 | |||
652 | static long region_add(struct list_head *head, long f, long t) | ||
653 | { | ||
654 | struct file_region *rg, *nrg, *trg; | ||
655 | |||
656 | /* Locate the region we are either in or before. */ | ||
657 | list_for_each_entry(rg, head, link) | ||
658 | if (f <= rg->to) | ||
659 | break; | ||
660 | |||
661 | /* Round our left edge to the current segment if it encloses us. */ | ||
662 | if (f > rg->from) | ||
663 | f = rg->from; | ||
664 | |||
665 | /* Check for and consume any regions we now overlap with. */ | ||
666 | nrg = rg; | ||
667 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
668 | if (&rg->link == head) | ||
669 | break; | ||
670 | if (rg->from > t) | ||
671 | break; | ||
672 | |||
673 | /* If this area reaches higher then extend our area to | ||
674 | * include it completely. If this is not the first area | ||
675 | * which we intend to reuse, free it. */ | ||
676 | if (rg->to > t) | ||
677 | t = rg->to; | ||
678 | if (rg != nrg) { | ||
679 | list_del(&rg->link); | ||
680 | kfree(rg); | ||
681 | } | ||
682 | } | ||
683 | nrg->from = f; | ||
684 | nrg->to = t; | ||
685 | return 0; | ||
686 | } | ||
687 | |||
688 | static long region_chg(struct list_head *head, long f, long t) | ||
689 | { | ||
690 | struct file_region *rg, *nrg; | ||
691 | long chg = 0; | ||
692 | |||
693 | /* Locate the region we are before or in. */ | ||
694 | list_for_each_entry(rg, head, link) | ||
695 | if (f <= rg->to) | ||
696 | break; | ||
697 | |||
698 | /* If we are below the current region then a new region is required. | ||
699 | * Subtle, allocate a new region at the position but make it zero | ||
700 | * size such that we can guarantee to record the reservation. */ | ||
701 | if (&rg->link == head || t < rg->from) { | ||
702 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); | ||
703 | if (nrg == 0) | ||
704 | return -ENOMEM; | ||
705 | nrg->from = f; | ||
706 | nrg->to = f; | ||
707 | INIT_LIST_HEAD(&nrg->link); | ||
708 | list_add(&nrg->link, rg->link.prev); | ||
709 | |||
710 | return t - f; | ||
711 | } | ||
712 | |||
713 | /* Round our left edge to the current segment if it encloses us. */ | ||
714 | if (f > rg->from) | ||
715 | f = rg->from; | ||
716 | chg = t - f; | ||
717 | |||
718 | /* Check for and consume any regions we now overlap with. */ | ||
719 | list_for_each_entry(rg, rg->link.prev, link) { | ||
720 | if (&rg->link == head) | ||
721 | break; | ||
722 | if (rg->from > t) | ||
723 | return chg; | ||
724 | |||
725 | /* We overlap with this area, if it extends further than | ||
726 | * us then we must extend ourselves. Account for its | ||
727 | * existing reservation. */ | ||
728 | if (rg->to > t) { | ||
729 | chg += rg->to - t; | ||
730 | t = rg->to; | ||
731 | } | ||
732 | chg -= rg->to - rg->from; | ||
733 | } | ||
734 | return chg; | ||
735 | } | ||
736 | |||
737 | static long region_truncate(struct list_head *head, long end) | ||
738 | { | ||
739 | struct file_region *rg, *trg; | ||
740 | long chg = 0; | ||
741 | |||
742 | /* Locate the region we are either in or before. */ | ||
743 | list_for_each_entry(rg, head, link) | ||
744 | if (end <= rg->to) | ||
745 | break; | ||
746 | if (&rg->link == head) | ||
747 | return 0; | ||
748 | |||
749 | /* If we are in the middle of a region then adjust it. */ | ||
750 | if (end > rg->from) { | ||
751 | chg = rg->to - end; | ||
752 | rg->to = end; | ||
753 | rg = list_entry(rg->link.next, typeof(*rg), link); | ||
754 | } | ||
755 | |||
756 | /* Drop any remaining regions. */ | ||
757 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
758 | if (&rg->link == head) | ||
759 | break; | ||
760 | chg += rg->to - rg->from; | ||
761 | list_del(&rg->link); | ||
762 | kfree(rg); | ||
763 | } | ||
764 | return chg; | ||
765 | } | ||
766 | |||
767 | static int hugetlb_acct_memory(long delta) | ||
768 | { | ||
769 | int ret = -ENOMEM; | ||
770 | |||
771 | spin_lock(&hugetlb_lock); | ||
772 | if ((delta + resv_huge_pages) <= free_huge_pages) { | ||
773 | resv_huge_pages += delta; | ||
774 | ret = 0; | ||
775 | } | ||
776 | spin_unlock(&hugetlb_lock); | ||
777 | return ret; | ||
778 | } | ||
779 | |||
780 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
781 | { | ||
782 | long ret, chg; | ||
783 | |||
784 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
785 | if (chg < 0) | ||
786 | return chg; | ||
787 | ret = hugetlb_acct_memory(chg); | ||
788 | if (ret < 0) | ||
789 | return ret; | ||
790 | region_add(&inode->i_mapping->private_list, from, to); | ||
791 | return 0; | ||
792 | } | ||
793 | |||
794 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | ||
795 | { | ||
796 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | ||
797 | hugetlb_acct_memory(freed - chg); | ||
798 | } | ||
diff --git a/mm/memory.c b/mm/memory.c index 0ec7bc644271..247b5c312b9b 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
434 | /* pte contains position in swap or file, so copy. */ | 434 | /* pte contains position in swap or file, so copy. */ |
435 | if (unlikely(!pte_present(pte))) { | 435 | if (unlikely(!pte_present(pte))) { |
436 | if (!pte_file(pte)) { | 436 | if (!pte_file(pte)) { |
437 | swap_duplicate(pte_to_swp_entry(pte)); | 437 | swp_entry_t entry = pte_to_swp_entry(pte); |
438 | |||
439 | swap_duplicate(entry); | ||
438 | /* make sure dst_mm is on swapoff's mmlist. */ | 440 | /* make sure dst_mm is on swapoff's mmlist. */ |
439 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 441 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
440 | spin_lock(&mmlist_lock); | 442 | spin_lock(&mmlist_lock); |
@@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
443 | &src_mm->mmlist); | 445 | &src_mm->mmlist); |
444 | spin_unlock(&mmlist_lock); | 446 | spin_unlock(&mmlist_lock); |
445 | } | 447 | } |
448 | if (is_write_migration_entry(entry) && | ||
449 | is_cow_mapping(vm_flags)) { | ||
450 | /* | ||
451 | * COW mappings require pages in both parent | ||
452 | * and child to be set to read. | ||
453 | */ | ||
454 | make_migration_entry_read(&entry); | ||
455 | pte = swp_entry_to_pte(entry); | ||
456 | set_pte_at(src_mm, addr, src_pte, pte); | ||
457 | } | ||
446 | } | 458 | } |
447 | goto out_set_pte; | 459 | goto out_set_pte; |
448 | } | 460 | } |
@@ -1445,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1445 | { | 1457 | { |
1446 | struct page *old_page, *new_page; | 1458 | struct page *old_page, *new_page; |
1447 | pte_t entry; | 1459 | pte_t entry; |
1448 | int ret = VM_FAULT_MINOR; | 1460 | int reuse, ret = VM_FAULT_MINOR; |
1449 | 1461 | ||
1450 | old_page = vm_normal_page(vma, address, orig_pte); | 1462 | old_page = vm_normal_page(vma, address, orig_pte); |
1451 | if (!old_page) | 1463 | if (!old_page) |
1452 | goto gotten; | 1464 | goto gotten; |
1453 | 1465 | ||
1454 | if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { | 1466 | if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == |
1455 | int reuse = can_share_swap_page(old_page); | 1467 | (VM_SHARED|VM_WRITE))) { |
1456 | unlock_page(old_page); | 1468 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
1457 | if (reuse) { | 1469 | /* |
1458 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1470 | * Notify the address space that the page is about to |
1459 | entry = pte_mkyoung(orig_pte); | 1471 | * become writable so that it can prohibit this or wait |
1460 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1472 | * for the page to get into an appropriate state. |
1461 | ptep_set_access_flags(vma, address, page_table, entry, 1); | 1473 | * |
1462 | update_mmu_cache(vma, address, entry); | 1474 | * We do this without the lock held, so that it can |
1463 | lazy_mmu_prot_update(entry); | 1475 | * sleep if it needs to. |
1464 | ret |= VM_FAULT_WRITE; | 1476 | */ |
1465 | goto unlock; | 1477 | page_cache_get(old_page); |
1478 | pte_unmap_unlock(page_table, ptl); | ||
1479 | |||
1480 | if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) | ||
1481 | goto unwritable_page; | ||
1482 | |||
1483 | page_cache_release(old_page); | ||
1484 | |||
1485 | /* | ||
1486 | * Since we dropped the lock we need to revalidate | ||
1487 | * the PTE as someone else may have changed it. If | ||
1488 | * they did, we just return, as we can count on the | ||
1489 | * MMU to tell us if they didn't also make it writable. | ||
1490 | */ | ||
1491 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
1492 | &ptl); | ||
1493 | if (!pte_same(*page_table, orig_pte)) | ||
1494 | goto unlock; | ||
1466 | } | 1495 | } |
1496 | |||
1497 | reuse = 1; | ||
1498 | } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { | ||
1499 | reuse = can_share_swap_page(old_page); | ||
1500 | unlock_page(old_page); | ||
1501 | } else { | ||
1502 | reuse = 0; | ||
1503 | } | ||
1504 | |||
1505 | if (reuse) { | ||
1506 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | ||
1507 | entry = pte_mkyoung(orig_pte); | ||
1508 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1509 | ptep_set_access_flags(vma, address, page_table, entry, 1); | ||
1510 | update_mmu_cache(vma, address, entry); | ||
1511 | lazy_mmu_prot_update(entry); | ||
1512 | ret |= VM_FAULT_WRITE; | ||
1513 | goto unlock; | ||
1467 | } | 1514 | } |
1468 | 1515 | ||
1469 | /* | 1516 | /* |
@@ -1523,6 +1570,10 @@ oom: | |||
1523 | if (old_page) | 1570 | if (old_page) |
1524 | page_cache_release(old_page); | 1571 | page_cache_release(old_page); |
1525 | return VM_FAULT_OOM; | 1572 | return VM_FAULT_OOM; |
1573 | |||
1574 | unwritable_page: | ||
1575 | page_cache_release(old_page); | ||
1576 | return VM_FAULT_SIGBUS; | ||
1526 | } | 1577 | } |
1527 | 1578 | ||
1528 | /* | 1579 | /* |
@@ -1879,7 +1930,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1879 | goto out; | 1930 | goto out; |
1880 | 1931 | ||
1881 | entry = pte_to_swp_entry(orig_pte); | 1932 | entry = pte_to_swp_entry(orig_pte); |
1882 | again: | 1933 | if (is_migration_entry(entry)) { |
1934 | migration_entry_wait(mm, pmd, address); | ||
1935 | goto out; | ||
1936 | } | ||
1883 | page = lookup_swap_cache(entry); | 1937 | page = lookup_swap_cache(entry); |
1884 | if (!page) { | 1938 | if (!page) { |
1885 | swapin_readahead(entry, address, vma); | 1939 | swapin_readahead(entry, address, vma); |
@@ -1903,12 +1957,6 @@ again: | |||
1903 | 1957 | ||
1904 | mark_page_accessed(page); | 1958 | mark_page_accessed(page); |
1905 | lock_page(page); | 1959 | lock_page(page); |
1906 | if (!PageSwapCache(page)) { | ||
1907 | /* Page migration has occured */ | ||
1908 | unlock_page(page); | ||
1909 | page_cache_release(page); | ||
1910 | goto again; | ||
1911 | } | ||
1912 | 1960 | ||
1913 | /* | 1961 | /* |
1914 | * Back out if somebody else already faulted in this pte. | 1962 | * Back out if somebody else already faulted in this pte. |
@@ -2074,18 +2122,31 @@ retry: | |||
2074 | /* | 2122 | /* |
2075 | * Should we do an early C-O-W break? | 2123 | * Should we do an early C-O-W break? |
2076 | */ | 2124 | */ |
2077 | if (write_access && !(vma->vm_flags & VM_SHARED)) { | 2125 | if (write_access) { |
2078 | struct page *page; | 2126 | if (!(vma->vm_flags & VM_SHARED)) { |
2127 | struct page *page; | ||
2079 | 2128 | ||
2080 | if (unlikely(anon_vma_prepare(vma))) | 2129 | if (unlikely(anon_vma_prepare(vma))) |
2081 | goto oom; | 2130 | goto oom; |
2082 | page = alloc_page_vma(GFP_HIGHUSER, vma, address); | 2131 | page = alloc_page_vma(GFP_HIGHUSER, vma, address); |
2083 | if (!page) | 2132 | if (!page) |
2084 | goto oom; | 2133 | goto oom; |
2085 | copy_user_highpage(page, new_page, address); | 2134 | copy_user_highpage(page, new_page, address); |
2086 | page_cache_release(new_page); | 2135 | page_cache_release(new_page); |
2087 | new_page = page; | 2136 | new_page = page; |
2088 | anon = 1; | 2137 | anon = 1; |
2138 | |||
2139 | } else { | ||
2140 | /* if the page will be shareable, see if the backing | ||
2141 | * address space wants to know that the page is about | ||
2142 | * to become writable */ | ||
2143 | if (vma->vm_ops->page_mkwrite && | ||
2144 | vma->vm_ops->page_mkwrite(vma, new_page) < 0 | ||
2145 | ) { | ||
2146 | page_cache_release(new_page); | ||
2147 | return VM_FAULT_SIGBUS; | ||
2148 | } | ||
2149 | } | ||
2089 | } | 2150 | } |
2090 | 2151 | ||
2091 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2152 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 70df5c0d957e..841a077d5aeb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -26,7 +26,7 @@ | |||
26 | 26 | ||
27 | extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, | 27 | extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, |
28 | unsigned long size); | 28 | unsigned long size); |
29 | static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) | 29 | static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) |
30 | { | 30 | { |
31 | struct pglist_data *pgdat = zone->zone_pgdat; | 31 | struct pglist_data *pgdat = zone->zone_pgdat; |
32 | int nr_pages = PAGES_PER_SECTION; | 32 | int nr_pages = PAGES_PER_SECTION; |
@@ -34,8 +34,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
34 | int zone_type; | 34 | int zone_type; |
35 | 35 | ||
36 | zone_type = zone - pgdat->node_zones; | 36 | zone_type = zone - pgdat->node_zones; |
37 | if (!populated_zone(zone)) { | ||
38 | int ret = 0; | ||
39 | ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages); | ||
40 | if (ret < 0) | ||
41 | return ret; | ||
42 | } | ||
37 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); | 43 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); |
38 | zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); | 44 | zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); |
45 | return 0; | ||
39 | } | 46 | } |
40 | 47 | ||
41 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | 48 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, |
@@ -50,7 +57,11 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) | |||
50 | if (ret < 0) | 57 | if (ret < 0) |
51 | return ret; | 58 | return ret; |
52 | 59 | ||
53 | __add_zone(zone, phys_start_pfn); | 60 | ret = __add_zone(zone, phys_start_pfn); |
61 | |||
62 | if (ret < 0) | ||
63 | return ret; | ||
64 | |||
54 | return register_new_memory(__pfn_to_section(phys_start_pfn)); | 65 | return register_new_memory(__pfn_to_section(phys_start_pfn)); |
55 | } | 66 | } |
56 | 67 | ||
@@ -116,6 +127,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
116 | unsigned long flags; | 127 | unsigned long flags; |
117 | unsigned long onlined_pages = 0; | 128 | unsigned long onlined_pages = 0; |
118 | struct zone *zone; | 129 | struct zone *zone; |
130 | int need_zonelists_rebuild = 0; | ||
119 | 131 | ||
120 | /* | 132 | /* |
121 | * This doesn't need a lock to do pfn_to_page(). | 133 | * This doesn't need a lock to do pfn_to_page(). |
@@ -128,6 +140,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
128 | grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); | 140 | grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); |
129 | pgdat_resize_unlock(zone->zone_pgdat, &flags); | 141 | pgdat_resize_unlock(zone->zone_pgdat, &flags); |
130 | 142 | ||
143 | /* | ||
144 | * If this zone is not populated, then it is not in zonelist. | ||
145 | * This means the page allocator ignores this zone. | ||
146 | * So, zonelist must be updated after online. | ||
147 | */ | ||
148 | if (!populated_zone(zone)) | ||
149 | need_zonelists_rebuild = 1; | ||
150 | |||
131 | for (i = 0; i < nr_pages; i++) { | 151 | for (i = 0; i < nr_pages; i++) { |
132 | struct page *page = pfn_to_page(pfn + i); | 152 | struct page *page = pfn_to_page(pfn + i); |
133 | online_page(page); | 153 | online_page(page); |
@@ -138,5 +158,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
138 | 158 | ||
139 | setup_per_zone_pages_min(); | 159 | setup_per_zone_pages_min(); |
140 | 160 | ||
161 | if (need_zonelists_rebuild) | ||
162 | build_all_zonelists(); | ||
163 | vm_total_pages = nr_free_pagecache_pages(); | ||
141 | return 0; | 164 | return 0; |
142 | } | 165 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8778f58880c4..ec4a1a950df9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -87,6 +87,8 @@ | |||
87 | #include <linux/seq_file.h> | 87 | #include <linux/seq_file.h> |
88 | #include <linux/proc_fs.h> | 88 | #include <linux/proc_fs.h> |
89 | #include <linux/migrate.h> | 89 | #include <linux/migrate.h> |
90 | #include <linux/rmap.h> | ||
91 | #include <linux/security.h> | ||
90 | 92 | ||
91 | #include <asm/tlbflush.h> | 93 | #include <asm/tlbflush.h> |
92 | #include <asm/uaccess.h> | 94 | #include <asm/uaccess.h> |
@@ -587,6 +589,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
587 | isolate_lru_page(page, pagelist); | 589 | isolate_lru_page(page, pagelist); |
588 | } | 590 | } |
589 | 591 | ||
592 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | ||
593 | { | ||
594 | return alloc_pages_node(node, GFP_HIGHUSER, 0); | ||
595 | } | ||
596 | |||
590 | /* | 597 | /* |
591 | * Migrate pages from one node to a target node. | 598 | * Migrate pages from one node to a target node. |
592 | * Returns error or the number of pages not migrated. | 599 | * Returns error or the number of pages not migrated. |
@@ -603,11 +610,9 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) | |||
603 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, | 610 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, |
604 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 611 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
605 | 612 | ||
606 | if (!list_empty(&pagelist)) { | 613 | if (!list_empty(&pagelist)) |
607 | err = migrate_pages_to(&pagelist, NULL, dest); | 614 | err = migrate_pages(&pagelist, new_node_page, dest); |
608 | if (!list_empty(&pagelist)) | 615 | |
609 | putback_lru_pages(&pagelist); | ||
610 | } | ||
611 | return err; | 616 | return err; |
612 | } | 617 | } |
613 | 618 | ||
@@ -694,6 +699,12 @@ int do_migrate_pages(struct mm_struct *mm, | |||
694 | 699 | ||
695 | } | 700 | } |
696 | 701 | ||
702 | static struct page *new_vma_page(struct page *page, unsigned long private, int **x) | ||
703 | { | ||
704 | struct vm_area_struct *vma = (struct vm_area_struct *)private; | ||
705 | |||
706 | return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma)); | ||
707 | } | ||
697 | #else | 708 | #else |
698 | 709 | ||
699 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 710 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
@@ -706,6 +717,11 @@ int do_migrate_pages(struct mm_struct *mm, | |||
706 | { | 717 | { |
707 | return -ENOSYS; | 718 | return -ENOSYS; |
708 | } | 719 | } |
720 | |||
721 | static struct page *new_vma_page(struct page *page, unsigned long private) | ||
722 | { | ||
723 | return NULL; | ||
724 | } | ||
709 | #endif | 725 | #endif |
710 | 726 | ||
711 | long do_mbind(unsigned long start, unsigned long len, | 727 | long do_mbind(unsigned long start, unsigned long len, |
@@ -767,15 +783,13 @@ long do_mbind(unsigned long start, unsigned long len, | |||
767 | err = mbind_range(vma, start, end, new); | 783 | err = mbind_range(vma, start, end, new); |
768 | 784 | ||
769 | if (!list_empty(&pagelist)) | 785 | if (!list_empty(&pagelist)) |
770 | nr_failed = migrate_pages_to(&pagelist, vma, -1); | 786 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
787 | (unsigned long)vma); | ||
771 | 788 | ||
772 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 789 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
773 | err = -EIO; | 790 | err = -EIO; |
774 | } | 791 | } |
775 | 792 | ||
776 | if (!list_empty(&pagelist)) | ||
777 | putback_lru_pages(&pagelist); | ||
778 | |||
779 | up_write(&mm->mmap_sem); | 793 | up_write(&mm->mmap_sem); |
780 | mpol_free(new); | 794 | mpol_free(new); |
781 | return err; | 795 | return err; |
@@ -929,6 +943,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | |||
929 | goto out; | 943 | goto out; |
930 | } | 944 | } |
931 | 945 | ||
946 | err = security_task_movememory(task); | ||
947 | if (err) | ||
948 | goto out; | ||
949 | |||
932 | err = do_migrate_pages(mm, &old, &new, | 950 | err = do_migrate_pages(mm, &old, &new, |
933 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | 951 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); |
934 | out: | 952 | out: |
diff --git a/mm/migrate.c b/mm/migrate.c index 1c25040693d2..1c2a71aa05cd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/migrate.h> | 15 | #include <linux/migrate.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/swapops.h> | ||
18 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
19 | #include <linux/buffer_head.h> | 20 | #include <linux/buffer_head.h> |
20 | #include <linux/mm_inline.h> | 21 | #include <linux/mm_inline.h> |
@@ -23,13 +24,13 @@ | |||
23 | #include <linux/topology.h> | 24 | #include <linux/topology.h> |
24 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
25 | #include <linux/cpuset.h> | 26 | #include <linux/cpuset.h> |
26 | #include <linux/swapops.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/mempolicy.h> | ||
29 | #include <linux/vmalloc.h> | ||
30 | #include <linux/security.h> | ||
27 | 31 | ||
28 | #include "internal.h" | 32 | #include "internal.h" |
29 | 33 | ||
30 | /* The maximum number of pages to take off the LRU for migration */ | ||
31 | #define MIGRATE_CHUNK_SIZE 256 | ||
32 | |||
33 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 34 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
34 | 35 | ||
35 | /* | 36 | /* |
@@ -64,16 +65,11 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) | |||
64 | } | 65 | } |
65 | 66 | ||
66 | /* | 67 | /* |
67 | * migrate_prep() needs to be called after we have compiled the list of pages | 68 | * migrate_prep() needs to be called before we start compiling a list of pages |
68 | * to be migrated using isolate_lru_page() but before we begin a series of calls | 69 | * to be migrated using isolate_lru_page(). |
69 | * to migrate_pages(). | ||
70 | */ | 70 | */ |
71 | int migrate_prep(void) | 71 | int migrate_prep(void) |
72 | { | 72 | { |
73 | /* Must have swap device for migration */ | ||
74 | if (nr_swap_pages <= 0) | ||
75 | return -ENODEV; | ||
76 | |||
77 | /* | 73 | /* |
78 | * Clear the LRU lists so pages can be isolated. | 74 | * Clear the LRU lists so pages can be isolated. |
79 | * Note that pages may be moved off the LRU after we have | 75 | * Note that pages may be moved off the LRU after we have |
@@ -87,7 +83,6 @@ int migrate_prep(void) | |||
87 | 83 | ||
88 | static inline void move_to_lru(struct page *page) | 84 | static inline void move_to_lru(struct page *page) |
89 | { | 85 | { |
90 | list_del(&page->lru); | ||
91 | if (PageActive(page)) { | 86 | if (PageActive(page)) { |
92 | /* | 87 | /* |
93 | * lru_cache_add_active checks that | 88 | * lru_cache_add_active checks that |
@@ -113,113 +108,200 @@ int putback_lru_pages(struct list_head *l) | |||
113 | int count = 0; | 108 | int count = 0; |
114 | 109 | ||
115 | list_for_each_entry_safe(page, page2, l, lru) { | 110 | list_for_each_entry_safe(page, page2, l, lru) { |
111 | list_del(&page->lru); | ||
116 | move_to_lru(page); | 112 | move_to_lru(page); |
117 | count++; | 113 | count++; |
118 | } | 114 | } |
119 | return count; | 115 | return count; |
120 | } | 116 | } |
121 | 117 | ||
122 | /* | 118 | static inline int is_swap_pte(pte_t pte) |
123 | * Non migratable page | ||
124 | */ | ||
125 | int fail_migrate_page(struct page *newpage, struct page *page) | ||
126 | { | 119 | { |
127 | return -EIO; | 120 | return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); |
128 | } | 121 | } |
129 | EXPORT_SYMBOL(fail_migrate_page); | ||
130 | 122 | ||
131 | /* | 123 | /* |
132 | * swapout a single page | 124 | * Restore a potential migration pte to a working pte entry |
133 | * page is locked upon entry, unlocked on exit | ||
134 | */ | 125 | */ |
135 | static int swap_page(struct page *page) | 126 | static void remove_migration_pte(struct vm_area_struct *vma, |
127 | struct page *old, struct page *new) | ||
136 | { | 128 | { |
137 | struct address_space *mapping = page_mapping(page); | 129 | struct mm_struct *mm = vma->vm_mm; |
130 | swp_entry_t entry; | ||
131 | pgd_t *pgd; | ||
132 | pud_t *pud; | ||
133 | pmd_t *pmd; | ||
134 | pte_t *ptep, pte; | ||
135 | spinlock_t *ptl; | ||
136 | unsigned long addr = page_address_in_vma(new, vma); | ||
137 | |||
138 | if (addr == -EFAULT) | ||
139 | return; | ||
140 | |||
141 | pgd = pgd_offset(mm, addr); | ||
142 | if (!pgd_present(*pgd)) | ||
143 | return; | ||
144 | |||
145 | pud = pud_offset(pgd, addr); | ||
146 | if (!pud_present(*pud)) | ||
147 | return; | ||
148 | |||
149 | pmd = pmd_offset(pud, addr); | ||
150 | if (!pmd_present(*pmd)) | ||
151 | return; | ||
152 | |||
153 | ptep = pte_offset_map(pmd, addr); | ||
154 | |||
155 | if (!is_swap_pte(*ptep)) { | ||
156 | pte_unmap(ptep); | ||
157 | return; | ||
158 | } | ||
138 | 159 | ||
139 | if (page_mapped(page) && mapping) | 160 | ptl = pte_lockptr(mm, pmd); |
140 | if (try_to_unmap(page, 1) != SWAP_SUCCESS) | 161 | spin_lock(ptl); |
141 | goto unlock_retry; | 162 | pte = *ptep; |
163 | if (!is_swap_pte(pte)) | ||
164 | goto out; | ||
142 | 165 | ||
143 | if (PageDirty(page)) { | 166 | entry = pte_to_swp_entry(pte); |
144 | /* Page is dirty, try to write it out here */ | ||
145 | switch(pageout(page, mapping)) { | ||
146 | case PAGE_KEEP: | ||
147 | case PAGE_ACTIVATE: | ||
148 | goto unlock_retry; | ||
149 | 167 | ||
150 | case PAGE_SUCCESS: | 168 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) |
151 | goto retry; | 169 | goto out; |
152 | 170 | ||
153 | case PAGE_CLEAN: | 171 | get_page(new); |
154 | ; /* try to free the page below */ | 172 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
155 | } | 173 | if (is_write_migration_entry(entry)) |
156 | } | 174 | pte = pte_mkwrite(pte); |
175 | set_pte_at(mm, addr, ptep, pte); | ||
157 | 176 | ||
158 | if (PagePrivate(page)) { | 177 | if (PageAnon(new)) |
159 | if (!try_to_release_page(page, GFP_KERNEL) || | 178 | page_add_anon_rmap(new, vma, addr); |
160 | (!mapping && page_count(page) == 1)) | 179 | else |
161 | goto unlock_retry; | 180 | page_add_file_rmap(new); |
162 | } | ||
163 | 181 | ||
164 | if (remove_mapping(mapping, page)) { | 182 | /* No need to invalidate - it was non-present before */ |
165 | /* Success */ | 183 | update_mmu_cache(vma, addr, pte); |
166 | unlock_page(page); | 184 | lazy_mmu_prot_update(pte); |
167 | return 0; | ||
168 | } | ||
169 | 185 | ||
170 | unlock_retry: | 186 | out: |
171 | unlock_page(page); | 187 | pte_unmap_unlock(ptep, ptl); |
188 | } | ||
172 | 189 | ||
173 | retry: | 190 | /* |
174 | return -EAGAIN; | 191 | * Note that remove_file_migration_ptes will only work on regular mappings, |
192 | * Nonlinear mappings do not use migration entries. | ||
193 | */ | ||
194 | static void remove_file_migration_ptes(struct page *old, struct page *new) | ||
195 | { | ||
196 | struct vm_area_struct *vma; | ||
197 | struct address_space *mapping = page_mapping(new); | ||
198 | struct prio_tree_iter iter; | ||
199 | pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
200 | |||
201 | if (!mapping) | ||
202 | return; | ||
203 | |||
204 | spin_lock(&mapping->i_mmap_lock); | ||
205 | |||
206 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) | ||
207 | remove_migration_pte(vma, old, new); | ||
208 | |||
209 | spin_unlock(&mapping->i_mmap_lock); | ||
175 | } | 210 | } |
176 | 211 | ||
177 | /* | 212 | /* |
178 | * Remove references for a page and establish the new page with the correct | 213 | * Must hold mmap_sem lock on at least one of the vmas containing |
179 | * basic settings to be able to stop accesses to the page. | 214 | * the page so that the anon_vma cannot vanish. |
180 | */ | 215 | */ |
181 | int migrate_page_remove_references(struct page *newpage, | 216 | static void remove_anon_migration_ptes(struct page *old, struct page *new) |
182 | struct page *page, int nr_refs) | ||
183 | { | 217 | { |
184 | struct address_space *mapping = page_mapping(page); | 218 | struct anon_vma *anon_vma; |
185 | struct page **radix_pointer; | 219 | struct vm_area_struct *vma; |
220 | unsigned long mapping; | ||
186 | 221 | ||
187 | /* | 222 | mapping = (unsigned long)new->mapping; |
188 | * Avoid doing any of the following work if the page count | ||
189 | * indicates that the page is in use or truncate has removed | ||
190 | * the page. | ||
191 | */ | ||
192 | if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) | ||
193 | return -EAGAIN; | ||
194 | 223 | ||
195 | /* | 224 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) |
196 | * Establish swap ptes for anonymous pages or destroy pte | 225 | return; |
197 | * maps for files. | ||
198 | * | ||
199 | * In order to reestablish file backed mappings the fault handlers | ||
200 | * will take the radix tree_lock which may then be used to stop | ||
201 | * processses from accessing this page until the new page is ready. | ||
202 | * | ||
203 | * A process accessing via a swap pte (an anonymous page) will take a | ||
204 | * page_lock on the old page which will block the process until the | ||
205 | * migration attempt is complete. At that time the PageSwapCache bit | ||
206 | * will be examined. If the page was migrated then the PageSwapCache | ||
207 | * bit will be clear and the operation to retrieve the page will be | ||
208 | * retried which will find the new page in the radix tree. Then a new | ||
209 | * direct mapping may be generated based on the radix tree contents. | ||
210 | * | ||
211 | * If the page was not migrated then the PageSwapCache bit | ||
212 | * is still set and the operation may continue. | ||
213 | */ | ||
214 | if (try_to_unmap(page, 1) == SWAP_FAIL) | ||
215 | /* A vma has VM_LOCKED set -> permanent failure */ | ||
216 | return -EPERM; | ||
217 | 226 | ||
218 | /* | 227 | /* |
219 | * Give up if we were unable to remove all mappings. | 228 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. |
220 | */ | 229 | */ |
221 | if (page_mapcount(page)) | 230 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); |
222 | return -EAGAIN; | 231 | spin_lock(&anon_vma->lock); |
232 | |||
233 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | ||
234 | remove_migration_pte(vma, old, new); | ||
235 | |||
236 | spin_unlock(&anon_vma->lock); | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * Get rid of all migration entries and replace them by | ||
241 | * references to the indicated page. | ||
242 | */ | ||
243 | static void remove_migration_ptes(struct page *old, struct page *new) | ||
244 | { | ||
245 | if (PageAnon(new)) | ||
246 | remove_anon_migration_ptes(old, new); | ||
247 | else | ||
248 | remove_file_migration_ptes(old, new); | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * Something used the pte of a page under migration. We need to | ||
253 | * get to the page and wait until migration is finished. | ||
254 | * When we return from this function the fault will be retried. | ||
255 | * | ||
256 | * This function is called from do_swap_page(). | ||
257 | */ | ||
258 | void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | ||
259 | unsigned long address) | ||
260 | { | ||
261 | pte_t *ptep, pte; | ||
262 | spinlock_t *ptl; | ||
263 | swp_entry_t entry; | ||
264 | struct page *page; | ||
265 | |||
266 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
267 | pte = *ptep; | ||
268 | if (!is_swap_pte(pte)) | ||
269 | goto out; | ||
270 | |||
271 | entry = pte_to_swp_entry(pte); | ||
272 | if (!is_migration_entry(entry)) | ||
273 | goto out; | ||
274 | |||
275 | page = migration_entry_to_page(entry); | ||
276 | |||
277 | get_page(page); | ||
278 | pte_unmap_unlock(ptep, ptl); | ||
279 | wait_on_page_locked(page); | ||
280 | put_page(page); | ||
281 | return; | ||
282 | out: | ||
283 | pte_unmap_unlock(ptep, ptl); | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Replace the page in the mapping. | ||
288 | * | ||
289 | * The number of remaining references must be: | ||
290 | * 1 for anonymous pages without a mapping | ||
291 | * 2 for pages with a mapping | ||
292 | * 3 for pages with a mapping and PagePrivate set. | ||
293 | */ | ||
294 | static int migrate_page_move_mapping(struct address_space *mapping, | ||
295 | struct page *newpage, struct page *page) | ||
296 | { | ||
297 | struct page **radix_pointer; | ||
298 | |||
299 | if (!mapping) { | ||
300 | /* Anonymous page */ | ||
301 | if (page_count(page) != 1) | ||
302 | return -EAGAIN; | ||
303 | return 0; | ||
304 | } | ||
223 | 305 | ||
224 | write_lock_irq(&mapping->tree_lock); | 306 | write_lock_irq(&mapping->tree_lock); |
225 | 307 | ||
@@ -227,7 +309,7 @@ int migrate_page_remove_references(struct page *newpage, | |||
227 | &mapping->page_tree, | 309 | &mapping->page_tree, |
228 | page_index(page)); | 310 | page_index(page)); |
229 | 311 | ||
230 | if (!page_mapping(page) || page_count(page) != nr_refs || | 312 | if (page_count(page) != 2 + !!PagePrivate(page) || |
231 | *radix_pointer != page) { | 313 | *radix_pointer != page) { |
232 | write_unlock_irq(&mapping->tree_lock); | 314 | write_unlock_irq(&mapping->tree_lock); |
233 | return -EAGAIN; | 315 | return -EAGAIN; |
@@ -235,19 +317,14 @@ int migrate_page_remove_references(struct page *newpage, | |||
235 | 317 | ||
236 | /* | 318 | /* |
237 | * Now we know that no one else is looking at the page. | 319 | * Now we know that no one else is looking at the page. |
238 | * | ||
239 | * Certain minimal information about a page must be available | ||
240 | * in order for other subsystems to properly handle the page if they | ||
241 | * find it through the radix tree update before we are finished | ||
242 | * copying the page. | ||
243 | */ | 320 | */ |
244 | get_page(newpage); | 321 | get_page(newpage); |
245 | newpage->index = page->index; | 322 | #ifdef CONFIG_SWAP |
246 | newpage->mapping = page->mapping; | ||
247 | if (PageSwapCache(page)) { | 323 | if (PageSwapCache(page)) { |
248 | SetPageSwapCache(newpage); | 324 | SetPageSwapCache(newpage); |
249 | set_page_private(newpage, page_private(page)); | 325 | set_page_private(newpage, page_private(page)); |
250 | } | 326 | } |
327 | #endif | ||
251 | 328 | ||
252 | *radix_pointer = newpage; | 329 | *radix_pointer = newpage; |
253 | __put_page(page); | 330 | __put_page(page); |
@@ -255,12 +332,11 @@ int migrate_page_remove_references(struct page *newpage, | |||
255 | 332 | ||
256 | return 0; | 333 | return 0; |
257 | } | 334 | } |
258 | EXPORT_SYMBOL(migrate_page_remove_references); | ||
259 | 335 | ||
260 | /* | 336 | /* |
261 | * Copy the page to its new location | 337 | * Copy the page to its new location |
262 | */ | 338 | */ |
263 | void migrate_page_copy(struct page *newpage, struct page *page) | 339 | static void migrate_page_copy(struct page *newpage, struct page *page) |
264 | { | 340 | { |
265 | copy_highpage(newpage, page); | 341 | copy_highpage(newpage, page); |
266 | 342 | ||
@@ -282,7 +358,9 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
282 | set_page_dirty(newpage); | 358 | set_page_dirty(newpage); |
283 | } | 359 | } |
284 | 360 | ||
361 | #ifdef CONFIG_SWAP | ||
285 | ClearPageSwapCache(page); | 362 | ClearPageSwapCache(page); |
363 | #endif | ||
286 | ClearPageActive(page); | 364 | ClearPageActive(page); |
287 | ClearPagePrivate(page); | 365 | ClearPagePrivate(page); |
288 | set_page_private(page, 0); | 366 | set_page_private(page, 0); |
@@ -295,7 +373,18 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
295 | if (PageWriteback(newpage)) | 373 | if (PageWriteback(newpage)) |
296 | end_page_writeback(newpage); | 374 | end_page_writeback(newpage); |
297 | } | 375 | } |
298 | EXPORT_SYMBOL(migrate_page_copy); | 376 | |
377 | /************************************************************ | ||
378 | * Migration functions | ||
379 | ***********************************************************/ | ||
380 | |||
381 | /* Always fail migration. Used for mappings that are not movable */ | ||
382 | int fail_migrate_page(struct address_space *mapping, | ||
383 | struct page *newpage, struct page *page) | ||
384 | { | ||
385 | return -EIO; | ||
386 | } | ||
387 | EXPORT_SYMBOL(fail_migrate_page); | ||
299 | 388 | ||
300 | /* | 389 | /* |
301 | * Common logic to directly migrate a single page suitable for | 390 | * Common logic to directly migrate a single page suitable for |
@@ -303,51 +392,286 @@ EXPORT_SYMBOL(migrate_page_copy); | |||
303 | * | 392 | * |
304 | * Pages are locked upon entry and exit. | 393 | * Pages are locked upon entry and exit. |
305 | */ | 394 | */ |
306 | int migrate_page(struct page *newpage, struct page *page) | 395 | int migrate_page(struct address_space *mapping, |
396 | struct page *newpage, struct page *page) | ||
307 | { | 397 | { |
308 | int rc; | 398 | int rc; |
309 | 399 | ||
310 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | 400 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ |
311 | 401 | ||
312 | rc = migrate_page_remove_references(newpage, page, 2); | 402 | rc = migrate_page_move_mapping(mapping, newpage, page); |
403 | |||
404 | if (rc) | ||
405 | return rc; | ||
406 | |||
407 | migrate_page_copy(newpage, page); | ||
408 | return 0; | ||
409 | } | ||
410 | EXPORT_SYMBOL(migrate_page); | ||
411 | |||
412 | /* | ||
413 | * Migration function for pages with buffers. This function can only be used | ||
414 | * if the underlying filesystem guarantees that no other references to "page" | ||
415 | * exist. | ||
416 | */ | ||
417 | int buffer_migrate_page(struct address_space *mapping, | ||
418 | struct page *newpage, struct page *page) | ||
419 | { | ||
420 | struct buffer_head *bh, *head; | ||
421 | int rc; | ||
422 | |||
423 | if (!page_has_buffers(page)) | ||
424 | return migrate_page(mapping, newpage, page); | ||
425 | |||
426 | head = page_buffers(page); | ||
427 | |||
428 | rc = migrate_page_move_mapping(mapping, newpage, page); | ||
313 | 429 | ||
314 | if (rc) | 430 | if (rc) |
315 | return rc; | 431 | return rc; |
316 | 432 | ||
433 | bh = head; | ||
434 | do { | ||
435 | get_bh(bh); | ||
436 | lock_buffer(bh); | ||
437 | bh = bh->b_this_page; | ||
438 | |||
439 | } while (bh != head); | ||
440 | |||
441 | ClearPagePrivate(page); | ||
442 | set_page_private(newpage, page_private(page)); | ||
443 | set_page_private(page, 0); | ||
444 | put_page(page); | ||
445 | get_page(newpage); | ||
446 | |||
447 | bh = head; | ||
448 | do { | ||
449 | set_bh_page(bh, newpage, bh_offset(bh)); | ||
450 | bh = bh->b_this_page; | ||
451 | |||
452 | } while (bh != head); | ||
453 | |||
454 | SetPagePrivate(newpage); | ||
455 | |||
317 | migrate_page_copy(newpage, page); | 456 | migrate_page_copy(newpage, page); |
318 | 457 | ||
458 | bh = head; | ||
459 | do { | ||
460 | unlock_buffer(bh); | ||
461 | put_bh(bh); | ||
462 | bh = bh->b_this_page; | ||
463 | |||
464 | } while (bh != head); | ||
465 | |||
466 | return 0; | ||
467 | } | ||
468 | EXPORT_SYMBOL(buffer_migrate_page); | ||
469 | |||
470 | /* | ||
471 | * Writeback a page to clean the dirty state | ||
472 | */ | ||
473 | static int writeout(struct address_space *mapping, struct page *page) | ||
474 | { | ||
475 | struct writeback_control wbc = { | ||
476 | .sync_mode = WB_SYNC_NONE, | ||
477 | .nr_to_write = 1, | ||
478 | .range_start = 0, | ||
479 | .range_end = LLONG_MAX, | ||
480 | .nonblocking = 1, | ||
481 | .for_reclaim = 1 | ||
482 | }; | ||
483 | int rc; | ||
484 | |||
485 | if (!mapping->a_ops->writepage) | ||
486 | /* No write method for the address space */ | ||
487 | return -EINVAL; | ||
488 | |||
489 | if (!clear_page_dirty_for_io(page)) | ||
490 | /* Someone else already triggered a write */ | ||
491 | return -EAGAIN; | ||
492 | |||
319 | /* | 493 | /* |
320 | * Remove auxiliary swap entries and replace | 494 | * A dirty page may imply that the underlying filesystem has |
321 | * them with real ptes. | 495 | * the page on some queue. So the page must be clean for |
322 | * | 496 | * migration. Writeout may mean we lose the lock and the |
323 | * Note that a real pte entry will allow processes that are not | 497 | * page state is no longer what we checked for earlier. |
324 | * waiting on the page lock to use the new page via the page tables | 498 | * At this point we know that the migration attempt cannot |
325 | * before the new page is unlocked. | 499 | * be successful. |
326 | */ | 500 | */ |
327 | remove_from_swap(newpage); | 501 | remove_migration_ptes(page, page); |
328 | return 0; | 502 | |
503 | rc = mapping->a_ops->writepage(page, &wbc); | ||
504 | if (rc < 0) | ||
505 | /* I/O Error writing */ | ||
506 | return -EIO; | ||
507 | |||
508 | if (rc != AOP_WRITEPAGE_ACTIVATE) | ||
509 | /* unlocked. Relock */ | ||
510 | lock_page(page); | ||
511 | |||
512 | return -EAGAIN; | ||
513 | } | ||
514 | |||
515 | /* | ||
516 | * Default handling if a filesystem does not provide a migration function. | ||
517 | */ | ||
518 | static int fallback_migrate_page(struct address_space *mapping, | ||
519 | struct page *newpage, struct page *page) | ||
520 | { | ||
521 | if (PageDirty(page)) | ||
522 | return writeout(mapping, page); | ||
523 | |||
524 | /* | ||
525 | * Buffers may be managed in a filesystem specific way. | ||
526 | * We must have no buffers or drop them. | ||
527 | */ | ||
528 | if (page_has_buffers(page) && | ||
529 | !try_to_release_page(page, GFP_KERNEL)) | ||
530 | return -EAGAIN; | ||
531 | |||
532 | return migrate_page(mapping, newpage, page); | ||
533 | } | ||
534 | |||
535 | /* | ||
536 | * Move a page to a newly allocated page | ||
537 | * The page is locked and all ptes have been successfully removed. | ||
538 | * | ||
539 | * The new page will have replaced the old page if this function | ||
540 | * is successful. | ||
541 | */ | ||
542 | static int move_to_new_page(struct page *newpage, struct page *page) | ||
543 | { | ||
544 | struct address_space *mapping; | ||
545 | int rc; | ||
546 | |||
547 | /* | ||
548 | * Block others from accessing the page when we get around to | ||
549 | * establishing additional references. We are the only one | ||
550 | * holding a reference to the new page at this point. | ||
551 | */ | ||
552 | if (TestSetPageLocked(newpage)) | ||
553 | BUG(); | ||
554 | |||
555 | /* Prepare mapping for the new page.*/ | ||
556 | newpage->index = page->index; | ||
557 | newpage->mapping = page->mapping; | ||
558 | |||
559 | mapping = page_mapping(page); | ||
560 | if (!mapping) | ||
561 | rc = migrate_page(mapping, newpage, page); | ||
562 | else if (mapping->a_ops->migratepage) | ||
563 | /* | ||
564 | * Most pages have a mapping and most filesystems | ||
565 | * should provide a migration function. Anonymous | ||
566 | * pages are part of swap space which also has its | ||
567 | * own migration function. This is the most common | ||
568 | * path for page migration. | ||
569 | */ | ||
570 | rc = mapping->a_ops->migratepage(mapping, | ||
571 | newpage, page); | ||
572 | else | ||
573 | rc = fallback_migrate_page(mapping, newpage, page); | ||
574 | |||
575 | if (!rc) | ||
576 | remove_migration_ptes(page, newpage); | ||
577 | else | ||
578 | newpage->mapping = NULL; | ||
579 | |||
580 | unlock_page(newpage); | ||
581 | |||
582 | return rc; | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * Obtain the lock on page, remove all ptes and migrate the page | ||
587 | * to the newly allocated page in newpage. | ||
588 | */ | ||
589 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | ||
590 | struct page *page, int force) | ||
591 | { | ||
592 | int rc = 0; | ||
593 | int *result = NULL; | ||
594 | struct page *newpage = get_new_page(page, private, &result); | ||
595 | |||
596 | if (!newpage) | ||
597 | return -ENOMEM; | ||
598 | |||
599 | if (page_count(page) == 1) | ||
600 | /* page was freed from under us. So we are done. */ | ||
601 | goto move_newpage; | ||
602 | |||
603 | rc = -EAGAIN; | ||
604 | if (TestSetPageLocked(page)) { | ||
605 | if (!force) | ||
606 | goto move_newpage; | ||
607 | lock_page(page); | ||
608 | } | ||
609 | |||
610 | if (PageWriteback(page)) { | ||
611 | if (!force) | ||
612 | goto unlock; | ||
613 | wait_on_page_writeback(page); | ||
614 | } | ||
615 | |||
616 | /* | ||
617 | * Establish migration ptes or remove ptes | ||
618 | */ | ||
619 | if (try_to_unmap(page, 1) != SWAP_FAIL) { | ||
620 | if (!page_mapped(page)) | ||
621 | rc = move_to_new_page(newpage, page); | ||
622 | } else | ||
623 | /* A vma has VM_LOCKED set -> permanent failure */ | ||
624 | rc = -EPERM; | ||
625 | |||
626 | if (rc) | ||
627 | remove_migration_ptes(page, page); | ||
628 | unlock: | ||
629 | unlock_page(page); | ||
630 | |||
631 | if (rc != -EAGAIN) { | ||
632 | /* | ||
633 | * A page that has been migrated has all references | ||
634 | * removed and will be freed. A page that has not been | ||
635 | * migrated will have kept its references and be | ||
636 | * restored. | ||
637 | */ | ||
638 | list_del(&page->lru); | ||
639 | move_to_lru(page); | ||
640 | } | ||
641 | |||
642 | move_newpage: | ||
643 | /* | ||
644 | * Move the new page to the LRU. If migration was not successful | ||
645 | * then this will free the page. | ||
646 | */ | ||
647 | move_to_lru(newpage); | ||
648 | if (result) { | ||
649 | if (rc) | ||
650 | *result = rc; | ||
651 | else | ||
652 | *result = page_to_nid(newpage); | ||
653 | } | ||
654 | return rc; | ||
329 | } | 655 | } |
330 | EXPORT_SYMBOL(migrate_page); | ||
331 | 656 | ||
332 | /* | 657 | /* |
333 | * migrate_pages | 658 | * migrate_pages |
334 | * | 659 | * |
335 | * Two lists are passed to this function. The first list | 660 | * The function takes one list of pages to migrate and a function |
336 | * contains the pages isolated from the LRU to be migrated. | 661 | * that determines from the page to be migrated and the private data |
337 | * The second list contains new pages that the pages isolated | 662 | * the target of the move and allocates the page. |
338 | * can be moved to. If the second list is NULL then all | ||
339 | * pages are swapped out. | ||
340 | * | 663 | * |
341 | * The function returns after 10 attempts or if no pages | 664 | * The function returns after 10 attempts or if no pages |
342 | * are movable anymore because to has become empty | 665 | * are movable anymore because to has become empty |
343 | * or no retryable pages exist anymore. | 666 | * or no retryable pages exist anymore. All pages will be |
667 | * returned to the LRU or freed. | ||
344 | * | 668 | * |
345 | * Return: Number of pages not migrated when "to" ran empty. | 669 | * Return: Number of pages not migrated or error code. |
346 | */ | 670 | */ |
347 | int migrate_pages(struct list_head *from, struct list_head *to, | 671 | int migrate_pages(struct list_head *from, |
348 | struct list_head *moved, struct list_head *failed) | 672 | new_page_t get_new_page, unsigned long private) |
349 | { | 673 | { |
350 | int retry; | 674 | int retry = 1; |
351 | int nr_failed = 0; | 675 | int nr_failed = 0; |
352 | int pass = 0; | 676 | int pass = 0; |
353 | struct page *page; | 677 | struct page *page; |
@@ -358,305 +682,297 @@ int migrate_pages(struct list_head *from, struct list_head *to, | |||
358 | if (!swapwrite) | 682 | if (!swapwrite) |
359 | current->flags |= PF_SWAPWRITE; | 683 | current->flags |= PF_SWAPWRITE; |
360 | 684 | ||
361 | redo: | 685 | for(pass = 0; pass < 10 && retry; pass++) { |
362 | retry = 0; | 686 | retry = 0; |
687 | |||
688 | list_for_each_entry_safe(page, page2, from, lru) { | ||
689 | cond_resched(); | ||
690 | |||
691 | rc = unmap_and_move(get_new_page, private, | ||
692 | page, pass > 2); | ||
693 | |||
694 | switch(rc) { | ||
695 | case -ENOMEM: | ||
696 | goto out; | ||
697 | case -EAGAIN: | ||
698 | retry++; | ||
699 | break; | ||
700 | case 0: | ||
701 | break; | ||
702 | default: | ||
703 | /* Permanent failure */ | ||
704 | nr_failed++; | ||
705 | break; | ||
706 | } | ||
707 | } | ||
708 | } | ||
709 | rc = 0; | ||
710 | out: | ||
711 | if (!swapwrite) | ||
712 | current->flags &= ~PF_SWAPWRITE; | ||
363 | 713 | ||
364 | list_for_each_entry_safe(page, page2, from, lru) { | 714 | putback_lru_pages(from); |
365 | struct page *newpage = NULL; | ||
366 | struct address_space *mapping; | ||
367 | 715 | ||
368 | cond_resched(); | 716 | if (rc) |
717 | return rc; | ||
369 | 718 | ||
370 | rc = 0; | 719 | return nr_failed + retry; |
371 | if (page_count(page) == 1) | 720 | } |
372 | /* page was freed from under us. So we are done. */ | ||
373 | goto next; | ||
374 | 721 | ||
375 | if (to && list_empty(to)) | 722 | #ifdef CONFIG_NUMA |
376 | break; | 723 | /* |
724 | * Move a list of individual pages | ||
725 | */ | ||
726 | struct page_to_node { | ||
727 | unsigned long addr; | ||
728 | struct page *page; | ||
729 | int node; | ||
730 | int status; | ||
731 | }; | ||
377 | 732 | ||
378 | /* | 733 | static struct page *new_page_node(struct page *p, unsigned long private, |
379 | * Skip locked pages during the first two passes to give the | 734 | int **result) |
380 | * functions holding the lock time to release the page. Later we | 735 | { |
381 | * use lock_page() to have a higher chance of acquiring the | 736 | struct page_to_node *pm = (struct page_to_node *)private; |
382 | * lock. | ||
383 | */ | ||
384 | rc = -EAGAIN; | ||
385 | if (pass > 2) | ||
386 | lock_page(page); | ||
387 | else | ||
388 | if (TestSetPageLocked(page)) | ||
389 | goto next; | ||
390 | 737 | ||
391 | /* | 738 | while (pm->node != MAX_NUMNODES && pm->page != p) |
392 | * Only wait on writeback if we have already done a pass where | 739 | pm++; |
393 | * we we may have triggered writeouts for lots of pages. | ||
394 | */ | ||
395 | if (pass > 0) { | ||
396 | wait_on_page_writeback(page); | ||
397 | } else { | ||
398 | if (PageWriteback(page)) | ||
399 | goto unlock_page; | ||
400 | } | ||
401 | 740 | ||
402 | /* | 741 | if (pm->node == MAX_NUMNODES) |
403 | * Anonymous pages must have swap cache references otherwise | 742 | return NULL; |
404 | * the information contained in the page maps cannot be | ||
405 | * preserved. | ||
406 | */ | ||
407 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
408 | if (!add_to_swap(page, GFP_KERNEL)) { | ||
409 | rc = -ENOMEM; | ||
410 | goto unlock_page; | ||
411 | } | ||
412 | } | ||
413 | 743 | ||
414 | if (!to) { | 744 | *result = &pm->status; |
415 | rc = swap_page(page); | ||
416 | goto next; | ||
417 | } | ||
418 | 745 | ||
419 | newpage = lru_to_page(to); | 746 | return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); |
420 | lock_page(newpage); | 747 | } |
421 | 748 | ||
422 | /* | 749 | /* |
423 | * Pages are properly locked and writeback is complete. | 750 | * Move a set of pages as indicated in the pm array. The addr |
424 | * Try to migrate the page. | 751 | * field must be set to the virtual address of the page to be moved |
425 | */ | 752 | * and the node number must contain a valid target node. |
426 | mapping = page_mapping(page); | 753 | */ |
427 | if (!mapping) | 754 | static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, |
428 | goto unlock_both; | 755 | int migrate_all) |
756 | { | ||
757 | int err; | ||
758 | struct page_to_node *pp; | ||
759 | LIST_HEAD(pagelist); | ||
429 | 760 | ||
430 | if (mapping->a_ops->migratepage) { | 761 | down_read(&mm->mmap_sem); |
431 | /* | ||
432 | * Most pages have a mapping and most filesystems | ||
433 | * should provide a migration function. Anonymous | ||
434 | * pages are part of swap space which also has its | ||
435 | * own migration function. This is the most common | ||
436 | * path for page migration. | ||
437 | */ | ||
438 | rc = mapping->a_ops->migratepage(newpage, page); | ||
439 | goto unlock_both; | ||
440 | } | ||
441 | |||
442 | /* Make sure the dirty bit is up to date */ | ||
443 | if (try_to_unmap(page, 1) == SWAP_FAIL) { | ||
444 | rc = -EPERM; | ||
445 | goto unlock_both; | ||
446 | } | ||
447 | 762 | ||
448 | if (page_mapcount(page)) { | 763 | /* |
449 | rc = -EAGAIN; | 764 | * Build a list of pages to migrate |
450 | goto unlock_both; | 765 | */ |
451 | } | 766 | migrate_prep(); |
767 | for (pp = pm; pp->node != MAX_NUMNODES; pp++) { | ||
768 | struct vm_area_struct *vma; | ||
769 | struct page *page; | ||
452 | 770 | ||
453 | /* | 771 | /* |
454 | * Default handling if a filesystem does not provide | 772 | * A valid page pointer that will not match any of the |
455 | * a migration function. We can only migrate clean | 773 | * pages that will be moved. |
456 | * pages so try to write out any dirty pages first. | ||
457 | */ | 774 | */ |
458 | if (PageDirty(page)) { | 775 | pp->page = ZERO_PAGE(0); |
459 | switch (pageout(page, mapping)) { | ||
460 | case PAGE_KEEP: | ||
461 | case PAGE_ACTIVATE: | ||
462 | goto unlock_both; | ||
463 | |||
464 | case PAGE_SUCCESS: | ||
465 | unlock_page(newpage); | ||
466 | goto next; | ||
467 | |||
468 | case PAGE_CLEAN: | ||
469 | ; /* try to migrate the page below */ | ||
470 | } | ||
471 | } | ||
472 | 776 | ||
473 | /* | 777 | err = -EFAULT; |
474 | * Buffers are managed in a filesystem specific way. | 778 | vma = find_vma(mm, pp->addr); |
475 | * We must have no buffers or drop them. | 779 | if (!vma) |
476 | */ | 780 | goto set_status; |
477 | if (!page_has_buffers(page) || | ||
478 | try_to_release_page(page, GFP_KERNEL)) { | ||
479 | rc = migrate_page(newpage, page); | ||
480 | goto unlock_both; | ||
481 | } | ||
482 | 781 | ||
483 | /* | 782 | page = follow_page(vma, pp->addr, FOLL_GET); |
484 | * On early passes with mapped pages simply | 783 | err = -ENOENT; |
485 | * retry. There may be a lock held for some | 784 | if (!page) |
486 | * buffers that may go away. Later | 785 | goto set_status; |
487 | * swap them out. | 786 | |
488 | */ | 787 | if (PageReserved(page)) /* Check for zero page */ |
489 | if (pass > 4) { | 788 | goto put_and_set; |
789 | |||
790 | pp->page = page; | ||
791 | err = page_to_nid(page); | ||
792 | |||
793 | if (err == pp->node) | ||
490 | /* | 794 | /* |
491 | * Persistently unable to drop buffers..... As a | 795 | * Node already in the right place |
492 | * measure of last resort we fall back to | ||
493 | * swap_page(). | ||
494 | */ | 796 | */ |
495 | unlock_page(newpage); | 797 | goto put_and_set; |
496 | newpage = NULL; | ||
497 | rc = swap_page(page); | ||
498 | goto next; | ||
499 | } | ||
500 | 798 | ||
501 | unlock_both: | 799 | err = -EACCES; |
502 | unlock_page(newpage); | 800 | if (page_mapcount(page) > 1 && |
503 | 801 | !migrate_all) | |
504 | unlock_page: | 802 | goto put_and_set; |
505 | unlock_page(page); | 803 | |
506 | 804 | err = isolate_lru_page(page, &pagelist); | |
507 | next: | 805 | put_and_set: |
508 | if (rc == -EAGAIN) { | 806 | /* |
509 | retry++; | 807 | * Either remove the duplicate refcount from |
510 | } else if (rc) { | 808 | * isolate_lru_page() or drop the page ref if it was |
511 | /* Permanent failure */ | 809 | * not isolated. |
512 | list_move(&page->lru, failed); | 810 | */ |
513 | nr_failed++; | 811 | put_page(page); |
514 | } else { | 812 | set_status: |
515 | if (newpage) { | 813 | pp->status = err; |
516 | /* Successful migration. Return page to LRU */ | ||
517 | move_to_lru(newpage); | ||
518 | } | ||
519 | list_move(&page->lru, moved); | ||
520 | } | ||
521 | } | 814 | } |
522 | if (retry && pass++ < 10) | ||
523 | goto redo; | ||
524 | 815 | ||
525 | if (!swapwrite) | 816 | if (!list_empty(&pagelist)) |
526 | current->flags &= ~PF_SWAPWRITE; | 817 | err = migrate_pages(&pagelist, new_page_node, |
818 | (unsigned long)pm); | ||
819 | else | ||
820 | err = -ENOENT; | ||
527 | 821 | ||
528 | return nr_failed + retry; | 822 | up_read(&mm->mmap_sem); |
823 | return err; | ||
529 | } | 824 | } |
530 | 825 | ||
531 | /* | 826 | /* |
532 | * Migration function for pages with buffers. This function can only be used | 827 | * Determine the nodes of a list of pages. The addr in the pm array |
533 | * if the underlying filesystem guarantees that no other references to "page" | 828 | * must have been set to the virtual address of which we want to determine |
534 | * exist. | 829 | * the node number. |
535 | */ | 830 | */ |
536 | int buffer_migrate_page(struct page *newpage, struct page *page) | 831 | static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) |
537 | { | 832 | { |
538 | struct address_space *mapping = page->mapping; | 833 | down_read(&mm->mmap_sem); |
539 | struct buffer_head *bh, *head; | 834 | |
540 | int rc; | 835 | for ( ; pm->node != MAX_NUMNODES; pm++) { |
836 | struct vm_area_struct *vma; | ||
837 | struct page *page; | ||
838 | int err; | ||
839 | |||
840 | err = -EFAULT; | ||
841 | vma = find_vma(mm, pm->addr); | ||
842 | if (!vma) | ||
843 | goto set_status; | ||
844 | |||
845 | page = follow_page(vma, pm->addr, 0); | ||
846 | err = -ENOENT; | ||
847 | /* Use PageReserved to check for zero page */ | ||
848 | if (!page || PageReserved(page)) | ||
849 | goto set_status; | ||
850 | |||
851 | err = page_to_nid(page); | ||
852 | set_status: | ||
853 | pm->status = err; | ||
854 | } | ||
541 | 855 | ||
542 | if (!mapping) | 856 | up_read(&mm->mmap_sem); |
543 | return -EAGAIN; | 857 | return 0; |
858 | } | ||
544 | 859 | ||
545 | if (!page_has_buffers(page)) | 860 | /* |
546 | return migrate_page(newpage, page); | 861 | * Move a list of pages in the address space of the currently executing |
862 | * process. | ||
863 | */ | ||
864 | asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | ||
865 | const void __user * __user *pages, | ||
866 | const int __user *nodes, | ||
867 | int __user *status, int flags) | ||
868 | { | ||
869 | int err = 0; | ||
870 | int i; | ||
871 | struct task_struct *task; | ||
872 | nodemask_t task_nodes; | ||
873 | struct mm_struct *mm; | ||
874 | struct page_to_node *pm = NULL; | ||
547 | 875 | ||
548 | head = page_buffers(page); | 876 | /* Check flags */ |
877 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) | ||
878 | return -EINVAL; | ||
549 | 879 | ||
550 | rc = migrate_page_remove_references(newpage, page, 3); | 880 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
881 | return -EPERM; | ||
551 | 882 | ||
552 | if (rc) | 883 | /* Find the mm_struct */ |
553 | return rc; | 884 | read_lock(&tasklist_lock); |
885 | task = pid ? find_task_by_pid(pid) : current; | ||
886 | if (!task) { | ||
887 | read_unlock(&tasklist_lock); | ||
888 | return -ESRCH; | ||
889 | } | ||
890 | mm = get_task_mm(task); | ||
891 | read_unlock(&tasklist_lock); | ||
554 | 892 | ||
555 | bh = head; | 893 | if (!mm) |
556 | do { | 894 | return -EINVAL; |
557 | get_bh(bh); | ||
558 | lock_buffer(bh); | ||
559 | bh = bh->b_this_page; | ||
560 | 895 | ||
561 | } while (bh != head); | 896 | /* |
897 | * Check if this process has the right to modify the specified | ||
898 | * process. The right exists if the process has administrative | ||
899 | * capabilities, superuser privileges or the same | ||
900 | * userid as the target process. | ||
901 | */ | ||
902 | if ((current->euid != task->suid) && (current->euid != task->uid) && | ||
903 | (current->uid != task->suid) && (current->uid != task->uid) && | ||
904 | !capable(CAP_SYS_NICE)) { | ||
905 | err = -EPERM; | ||
906 | goto out2; | ||
907 | } | ||
562 | 908 | ||
563 | ClearPagePrivate(page); | 909 | err = security_task_movememory(task); |
564 | set_page_private(newpage, page_private(page)); | 910 | if (err) |
565 | set_page_private(page, 0); | 911 | goto out2; |
566 | put_page(page); | ||
567 | get_page(newpage); | ||
568 | 912 | ||
569 | bh = head; | ||
570 | do { | ||
571 | set_bh_page(bh, newpage, bh_offset(bh)); | ||
572 | bh = bh->b_this_page; | ||
573 | 913 | ||
574 | } while (bh != head); | 914 | task_nodes = cpuset_mems_allowed(task); |
575 | 915 | ||
576 | SetPagePrivate(newpage); | 916 | /* Limit nr_pages so that the multiplication may not overflow */ |
917 | if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { | ||
918 | err = -E2BIG; | ||
919 | goto out2; | ||
920 | } | ||
577 | 921 | ||
578 | migrate_page_copy(newpage, page); | 922 | pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); |
923 | if (!pm) { | ||
924 | err = -ENOMEM; | ||
925 | goto out2; | ||
926 | } | ||
579 | 927 | ||
580 | bh = head; | 928 | /* |
581 | do { | 929 | * Get parameters from user space and initialize the pm |
582 | unlock_buffer(bh); | 930 | * array. Return various errors if the user did something wrong. |
583 | put_bh(bh); | 931 | */ |
584 | bh = bh->b_this_page; | 932 | for (i = 0; i < nr_pages; i++) { |
933 | const void *p; | ||
585 | 934 | ||
586 | } while (bh != head); | 935 | err = -EFAULT; |
936 | if (get_user(p, pages + i)) | ||
937 | goto out; | ||
587 | 938 | ||
588 | return 0; | 939 | pm[i].addr = (unsigned long)p; |
589 | } | 940 | if (nodes) { |
590 | EXPORT_SYMBOL(buffer_migrate_page); | 941 | int node; |
591 | 942 | ||
592 | /* | 943 | if (get_user(node, nodes + i)) |
593 | * Migrate the list 'pagelist' of pages to a certain destination. | 944 | goto out; |
594 | * | ||
595 | * Specify destination with either non-NULL vma or dest_node >= 0 | ||
596 | * Return the number of pages not migrated or error code | ||
597 | */ | ||
598 | int migrate_pages_to(struct list_head *pagelist, | ||
599 | struct vm_area_struct *vma, int dest) | ||
600 | { | ||
601 | LIST_HEAD(newlist); | ||
602 | LIST_HEAD(moved); | ||
603 | LIST_HEAD(failed); | ||
604 | int err = 0; | ||
605 | unsigned long offset = 0; | ||
606 | int nr_pages; | ||
607 | struct page *page; | ||
608 | struct list_head *p; | ||
609 | 945 | ||
610 | redo: | 946 | err = -ENODEV; |
611 | nr_pages = 0; | 947 | if (!node_online(node)) |
612 | list_for_each(p, pagelist) { | 948 | goto out; |
613 | if (vma) { | ||
614 | /* | ||
615 | * The address passed to alloc_page_vma is used to | ||
616 | * generate the proper interleave behavior. We fake | ||
617 | * the address here by an increasing offset in order | ||
618 | * to get the proper distribution of pages. | ||
619 | * | ||
620 | * No decision has been made as to which page | ||
621 | * a certain old page is moved to so we cannot | ||
622 | * specify the correct address. | ||
623 | */ | ||
624 | page = alloc_page_vma(GFP_HIGHUSER, vma, | ||
625 | offset + vma->vm_start); | ||
626 | offset += PAGE_SIZE; | ||
627 | } | ||
628 | else | ||
629 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | ||
630 | 949 | ||
631 | if (!page) { | 950 | err = -EACCES; |
632 | err = -ENOMEM; | 951 | if (!node_isset(node, task_nodes)) |
633 | goto out; | 952 | goto out; |
953 | |||
954 | pm[i].node = node; | ||
634 | } | 955 | } |
635 | list_add_tail(&page->lru, &newlist); | ||
636 | nr_pages++; | ||
637 | if (nr_pages > MIGRATE_CHUNK_SIZE) | ||
638 | break; | ||
639 | } | 956 | } |
640 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | 957 | /* End marker */ |
958 | pm[nr_pages].node = MAX_NUMNODES; | ||
959 | |||
960 | if (nodes) | ||
961 | err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); | ||
962 | else | ||
963 | err = do_pages_stat(mm, pm); | ||
641 | 964 | ||
642 | putback_lru_pages(&moved); /* Call release pages instead ?? */ | 965 | if (err >= 0) |
966 | /* Return status information */ | ||
967 | for (i = 0; i < nr_pages; i++) | ||
968 | if (put_user(pm[i].status, status + i)) | ||
969 | err = -EFAULT; | ||
643 | 970 | ||
644 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) | ||
645 | goto redo; | ||
646 | out: | 971 | out: |
647 | /* Return leftover allocated pages */ | 972 | vfree(pm); |
648 | while (!list_empty(&newlist)) { | 973 | out2: |
649 | page = list_entry(newlist.next, struct page, lru); | 974 | mmput(mm); |
650 | list_del(&page->lru); | 975 | return err; |
651 | __free_page(page); | ||
652 | } | ||
653 | list_splice(&failed, pagelist); | ||
654 | if (err < 0) | ||
655 | return err; | ||
656 | |||
657 | /* Calculate number of leftover pages */ | ||
658 | nr_pages = 0; | ||
659 | list_for_each(p, pagelist) | ||
660 | nr_pages++; | ||
661 | return nr_pages; | ||
662 | } | 976 | } |
977 | #endif | ||
978 | |||
@@ -1065,7 +1065,8 @@ munmap_back: | |||
1065 | vma->vm_start = addr; | 1065 | vma->vm_start = addr; |
1066 | vma->vm_end = addr + len; | 1066 | vma->vm_end = addr + len; |
1067 | vma->vm_flags = vm_flags; | 1067 | vma->vm_flags = vm_flags; |
1068 | vma->vm_page_prot = protection_map[vm_flags & 0x0f]; | 1068 | vma->vm_page_prot = protection_map[vm_flags & |
1069 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; | ||
1069 | vma->vm_pgoff = pgoff; | 1070 | vma->vm_pgoff = pgoff; |
1070 | 1071 | ||
1071 | if (file) { | 1072 | if (file) { |
@@ -1089,6 +1090,12 @@ munmap_back: | |||
1089 | goto free_vma; | 1090 | goto free_vma; |
1090 | } | 1091 | } |
1091 | 1092 | ||
1093 | /* Don't make the VMA automatically writable if it's shared, but the | ||
1094 | * backer wishes to know when pages are first written to */ | ||
1095 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | ||
1096 | vma->vm_page_prot = | ||
1097 | protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; | ||
1098 | |||
1092 | /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform | 1099 | /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform |
1093 | * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) | 1100 | * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) |
1094 | * that memory reservation must be checked; but that reservation | 1101 | * that memory reservation must be checked; but that reservation |
@@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1921 | vma->vm_end = addr + len; | 1928 | vma->vm_end = addr + len; |
1922 | vma->vm_pgoff = pgoff; | 1929 | vma->vm_pgoff = pgoff; |
1923 | vma->vm_flags = flags; | 1930 | vma->vm_flags = flags; |
1924 | vma->vm_page_prot = protection_map[flags & 0x0f]; | 1931 | vma->vm_page_prot = protection_map[flags & |
1932 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; | ||
1925 | vma_link(mm, vma, prev, rb_link, rb_parent); | 1933 | vma_link(mm, vma, prev, rb_link, rb_parent); |
1926 | out: | 1934 | out: |
1927 | mm->total_vm += len >> PAGE_SHIFT; | 1935 | mm->total_vm += len >> PAGE_SHIFT; |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c14d4289b61..638edabaff71 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -19,7 +19,8 @@ | |||
19 | #include <linux/mempolicy.h> | 19 | #include <linux/mempolicy.h> |
20 | #include <linux/personality.h> | 20 | #include <linux/personality.h> |
21 | #include <linux/syscalls.h> | 21 | #include <linux/syscalls.h> |
22 | 22 | #include <linux/swap.h> | |
23 | #include <linux/swapops.h> | ||
23 | #include <asm/uaccess.h> | 24 | #include <asm/uaccess.h> |
24 | #include <asm/pgtable.h> | 25 | #include <asm/pgtable.h> |
25 | #include <asm/cacheflush.h> | 26 | #include <asm/cacheflush.h> |
@@ -28,12 +29,13 @@ | |||
28 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 29 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, |
29 | unsigned long addr, unsigned long end, pgprot_t newprot) | 30 | unsigned long addr, unsigned long end, pgprot_t newprot) |
30 | { | 31 | { |
31 | pte_t *pte; | 32 | pte_t *pte, oldpte; |
32 | spinlock_t *ptl; | 33 | spinlock_t *ptl; |
33 | 34 | ||
34 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 35 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
35 | do { | 36 | do { |
36 | if (pte_present(*pte)) { | 37 | oldpte = *pte; |
38 | if (pte_present(oldpte)) { | ||
37 | pte_t ptent; | 39 | pte_t ptent; |
38 | 40 | ||
39 | /* Avoid an SMP race with hardware updated dirty/clean | 41 | /* Avoid an SMP race with hardware updated dirty/clean |
@@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
43 | ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); | 45 | ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); |
44 | set_pte_at(mm, addr, pte, ptent); | 46 | set_pte_at(mm, addr, pte, ptent); |
45 | lazy_mmu_prot_update(ptent); | 47 | lazy_mmu_prot_update(ptent); |
48 | #ifdef CONFIG_MIGRATION | ||
49 | } else if (!pte_file(oldpte)) { | ||
50 | swp_entry_t entry = pte_to_swp_entry(oldpte); | ||
51 | |||
52 | if (is_write_migration_entry(entry)) { | ||
53 | /* | ||
54 | * A protection check is difficult so | ||
55 | * just be safe and disable write | ||
56 | */ | ||
57 | make_migration_entry_read(&entry); | ||
58 | set_pte_at(mm, addr, pte, | ||
59 | swp_entry_to_pte(entry)); | ||
60 | } | ||
61 | #endif | ||
46 | } | 62 | } |
63 | |||
47 | } while (pte++, addr += PAGE_SIZE, addr != end); | 64 | } while (pte++, addr += PAGE_SIZE, addr != end); |
48 | pte_unmap_unlock(pte - 1, ptl); | 65 | pte_unmap_unlock(pte - 1, ptl); |
49 | } | 66 | } |
@@ -106,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
106 | unsigned long oldflags = vma->vm_flags; | 123 | unsigned long oldflags = vma->vm_flags; |
107 | long nrpages = (end - start) >> PAGE_SHIFT; | 124 | long nrpages = (end - start) >> PAGE_SHIFT; |
108 | unsigned long charged = 0; | 125 | unsigned long charged = 0; |
126 | unsigned int mask; | ||
109 | pgprot_t newprot; | 127 | pgprot_t newprot; |
110 | pgoff_t pgoff; | 128 | pgoff_t pgoff; |
111 | int error; | 129 | int error; |
@@ -132,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
132 | } | 150 | } |
133 | } | 151 | } |
134 | 152 | ||
135 | newprot = protection_map[newflags & 0xf]; | ||
136 | |||
137 | /* | 153 | /* |
138 | * First try to merge with previous and/or next vma. | 154 | * First try to merge with previous and/or next vma. |
139 | */ | 155 | */ |
@@ -160,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
160 | } | 176 | } |
161 | 177 | ||
162 | success: | 178 | success: |
179 | /* Don't make the VMA automatically writable if it's shared, but the | ||
180 | * backer wishes to know when pages are first written to */ | ||
181 | mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED; | ||
182 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | ||
183 | mask &= ~VM_SHARED; | ||
184 | |||
185 | newprot = protection_map[newflags & mask]; | ||
186 | |||
163 | /* | 187 | /* |
164 | * vm_flags and vm_page_prot are protected by the mmap_sem | 188 | * vm_flags and vm_page_prot are protected by the mmap_sem |
165 | * held in write mode. | 189 | * held in write mode. |
@@ -205,8 +229,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) | |||
205 | /* | 229 | /* |
206 | * Does the application expect PROT_READ to imply PROT_EXEC: | 230 | * Does the application expect PROT_READ to imply PROT_EXEC: |
207 | */ | 231 | */ |
208 | if (unlikely((prot & PROT_READ) && | 232 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) |
209 | (current->personality & READ_IMPLIES_EXEC))) | ||
210 | prot |= PROT_EXEC; | 233 | prot |= PROT_EXEC; |
211 | 234 | ||
212 | vm_flags = calc_vm_prot_bits(prot); | 235 | vm_flags = calc_vm_prot_bits(prot); |
diff --git a/mm/msync.c b/mm/msync.c index bc6c95376366..d083544df21b 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
170 | * just ignore them, but return -ENOMEM at the end. | 170 | * just ignore them, but return -ENOMEM at the end. |
171 | */ | 171 | */ |
172 | down_read(&current->mm->mmap_sem); | 172 | down_read(&current->mm->mmap_sem); |
173 | if (flags & MS_SYNC) | ||
174 | current->flags |= PF_SYNCWRITE; | ||
175 | vma = find_vma(current->mm, start); | 173 | vma = find_vma(current->mm, start); |
176 | if (!vma) { | 174 | if (!vma) { |
177 | error = -ENOMEM; | 175 | error = -ENOMEM; |
@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
228 | } | 226 | } |
229 | } while (vma && !done); | 227 | } while (vma && !done); |
230 | out_unlock: | 228 | out_unlock: |
231 | current->flags &= ~PF_SYNCWRITE; | ||
232 | up_read(&current->mm->mmap_sem); | 229 | up_read(&current->mm->mmap_sem); |
233 | out: | 230 | out: |
234 | return error; | 231 | return error; |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 042e6436c3ee..d46ed0f1dc06 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -22,10 +22,11 @@ | |||
22 | #include <linux/jiffies.h> | 22 | #include <linux/jiffies.h> |
23 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
24 | 24 | ||
25 | int sysctl_panic_on_oom; | ||
25 | /* #define DEBUG */ | 26 | /* #define DEBUG */ |
26 | 27 | ||
27 | /** | 28 | /** |
28 | * oom_badness - calculate a numeric value for how bad this task has been | 29 | * badness - calculate a numeric value for how bad this task has been |
29 | * @p: task struct of which task we should calculate | 30 | * @p: task struct of which task we should calculate |
30 | * @uptime: current uptime in seconds | 31 | * @uptime: current uptime in seconds |
31 | * | 32 | * |
@@ -200,7 +201,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
200 | continue; | 201 | continue; |
201 | 202 | ||
202 | /* | 203 | /* |
203 | * This is in the process of releasing memory so for wait it | 204 | * This is in the process of releasing memory so wait for it |
204 | * to finish before killing some other task by mistake. | 205 | * to finish before killing some other task by mistake. |
205 | */ | 206 | */ |
206 | releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || | 207 | releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || |
@@ -306,7 +307,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, | |||
306 | } | 307 | } |
307 | 308 | ||
308 | /** | 309 | /** |
309 | * oom_kill - kill the "best" process when we run out of memory | 310 | * out_of_memory - kill the "best" process when we run out of memory |
310 | * | 311 | * |
311 | * If we run out of memory, we have the choice between either | 312 | * If we run out of memory, we have the choice between either |
312 | * killing a random task (bad), letting the system crash (worse) | 313 | * killing a random task (bad), letting the system crash (worse) |
@@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
344 | break; | 345 | break; |
345 | 346 | ||
346 | case CONSTRAINT_NONE: | 347 | case CONSTRAINT_NONE: |
348 | if (sysctl_panic_on_oom) | ||
349 | panic("out of memory. panic_on_oom is selected\n"); | ||
347 | retry: | 350 | retry: |
348 | /* | 351 | /* |
349 | * Rambo mode: Shoot down a process and hope it solves whatever | 352 | * Rambo mode: Shoot down a process and hope it solves whatever |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 75d7f48b79bb..8ccf6f1b1473 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -204,6 +204,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
204 | .sync_mode = WB_SYNC_NONE, | 204 | .sync_mode = WB_SYNC_NONE, |
205 | .older_than_this = NULL, | 205 | .older_than_this = NULL, |
206 | .nr_to_write = write_chunk, | 206 | .nr_to_write = write_chunk, |
207 | .range_cyclic = 1, | ||
207 | }; | 208 | }; |
208 | 209 | ||
209 | get_dirty_limits(&wbs, &background_thresh, | 210 | get_dirty_limits(&wbs, &background_thresh, |
@@ -331,6 +332,7 @@ static void background_writeout(unsigned long _min_pages) | |||
331 | .older_than_this = NULL, | 332 | .older_than_this = NULL, |
332 | .nr_to_write = 0, | 333 | .nr_to_write = 0, |
333 | .nonblocking = 1, | 334 | .nonblocking = 1, |
335 | .range_cyclic = 1, | ||
334 | }; | 336 | }; |
335 | 337 | ||
336 | for ( ; ; ) { | 338 | for ( ; ; ) { |
@@ -407,6 +409,7 @@ static void wb_kupdate(unsigned long arg) | |||
407 | .nr_to_write = 0, | 409 | .nr_to_write = 0, |
408 | .nonblocking = 1, | 410 | .nonblocking = 1, |
409 | .for_kupdate = 1, | 411 | .for_kupdate = 1, |
412 | .range_cyclic = 1, | ||
410 | }; | 413 | }; |
411 | 414 | ||
412 | sync_supers(); | 415 | sync_supers(); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 253a450c400d..423db0db7c02 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/nodemask.h> | 37 | #include <linux/nodemask.h> |
38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
39 | #include <linux/mempolicy.h> | 39 | #include <linux/mempolicy.h> |
40 | #include <linux/stop_machine.h> | ||
40 | 41 | ||
41 | #include <asm/tlbflush.h> | 42 | #include <asm/tlbflush.h> |
42 | #include <asm/div64.h> | 43 | #include <asm/div64.h> |
@@ -83,8 +84,8 @@ EXPORT_SYMBOL(zone_table); | |||
83 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; | 84 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; |
84 | int min_free_kbytes = 1024; | 85 | int min_free_kbytes = 1024; |
85 | 86 | ||
86 | unsigned long __initdata nr_kernel_pages; | 87 | unsigned long __meminitdata nr_kernel_pages; |
87 | unsigned long __initdata nr_all_pages; | 88 | unsigned long __meminitdata nr_all_pages; |
88 | 89 | ||
89 | #ifdef CONFIG_DEBUG_VM | 90 | #ifdef CONFIG_DEBUG_VM |
90 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 91 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
@@ -286,22 +287,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
286 | * we can do coalesce a page and its buddy if | 287 | * we can do coalesce a page and its buddy if |
287 | * (a) the buddy is not in a hole && | 288 | * (a) the buddy is not in a hole && |
288 | * (b) the buddy is in the buddy system && | 289 | * (b) the buddy is in the buddy system && |
289 | * (c) a page and its buddy have the same order. | 290 | * (c) a page and its buddy have the same order && |
291 | * (d) a page and its buddy are in the same zone. | ||
290 | * | 292 | * |
291 | * For recording whether a page is in the buddy system, we use PG_buddy. | 293 | * For recording whether a page is in the buddy system, we use PG_buddy. |
292 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. | 294 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. |
293 | * | 295 | * |
294 | * For recording page's order, we use page_private(page). | 296 | * For recording page's order, we use page_private(page). |
295 | */ | 297 | */ |
296 | static inline int page_is_buddy(struct page *page, int order) | 298 | static inline int page_is_buddy(struct page *page, struct page *buddy, |
299 | int order) | ||
297 | { | 300 | { |
298 | #ifdef CONFIG_HOLES_IN_ZONE | 301 | #ifdef CONFIG_HOLES_IN_ZONE |
299 | if (!pfn_valid(page_to_pfn(page))) | 302 | if (!pfn_valid(page_to_pfn(buddy))) |
300 | return 0; | 303 | return 0; |
301 | #endif | 304 | #endif |
302 | 305 | ||
303 | if (PageBuddy(page) && page_order(page) == order) { | 306 | if (page_zone_id(page) != page_zone_id(buddy)) |
304 | BUG_ON(page_count(page) != 0); | 307 | return 0; |
308 | |||
309 | if (PageBuddy(buddy) && page_order(buddy) == order) { | ||
310 | BUG_ON(page_count(buddy) != 0); | ||
305 | return 1; | 311 | return 1; |
306 | } | 312 | } |
307 | return 0; | 313 | return 0; |
@@ -352,7 +358,7 @@ static inline void __free_one_page(struct page *page, | |||
352 | struct page *buddy; | 358 | struct page *buddy; |
353 | 359 | ||
354 | buddy = __page_find_buddy(page, page_idx, order); | 360 | buddy = __page_find_buddy(page, page_idx, order); |
355 | if (!page_is_buddy(buddy, order)) | 361 | if (!page_is_buddy(page, buddy, order)) |
356 | break; /* Move the buddy up one level. */ | 362 | break; /* Move the buddy up one level. */ |
357 | 363 | ||
358 | list_del(&buddy->lru); | 364 | list_del(&buddy->lru); |
@@ -1485,7 +1491,7 @@ void show_free_areas(void) | |||
1485 | } | 1491 | } |
1486 | 1492 | ||
1487 | for_each_zone(zone) { | 1493 | for_each_zone(zone) { |
1488 | unsigned long nr, flags, order, total = 0; | 1494 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
1489 | 1495 | ||
1490 | show_node(zone); | 1496 | show_node(zone); |
1491 | printk("%s: ", zone->name); | 1497 | printk("%s: ", zone->name); |
@@ -1496,11 +1502,12 @@ void show_free_areas(void) | |||
1496 | 1502 | ||
1497 | spin_lock_irqsave(&zone->lock, flags); | 1503 | spin_lock_irqsave(&zone->lock, flags); |
1498 | for (order = 0; order < MAX_ORDER; order++) { | 1504 | for (order = 0; order < MAX_ORDER; order++) { |
1499 | nr = zone->free_area[order].nr_free; | 1505 | nr[order] = zone->free_area[order].nr_free; |
1500 | total += nr << order; | 1506 | total += nr[order] << order; |
1501 | printk("%lu*%lukB ", nr, K(1UL) << order); | ||
1502 | } | 1507 | } |
1503 | spin_unlock_irqrestore(&zone->lock, flags); | 1508 | spin_unlock_irqrestore(&zone->lock, flags); |
1509 | for (order = 0; order < MAX_ORDER; order++) | ||
1510 | printk("%lu*%lukB ", nr[order], K(1UL) << order); | ||
1504 | printk("= %lukB\n", K(total)); | 1511 | printk("= %lukB\n", K(total)); |
1505 | } | 1512 | } |
1506 | 1513 | ||
@@ -1512,7 +1519,7 @@ void show_free_areas(void) | |||
1512 | * | 1519 | * |
1513 | * Add all populated zones of a node to the zonelist. | 1520 | * Add all populated zones of a node to the zonelist. |
1514 | */ | 1521 | */ |
1515 | static int __init build_zonelists_node(pg_data_t *pgdat, | 1522 | static int __meminit build_zonelists_node(pg_data_t *pgdat, |
1516 | struct zonelist *zonelist, int nr_zones, int zone_type) | 1523 | struct zonelist *zonelist, int nr_zones, int zone_type) |
1517 | { | 1524 | { |
1518 | struct zone *zone; | 1525 | struct zone *zone; |
@@ -1548,7 +1555,7 @@ static inline int highest_zone(int zone_bits) | |||
1548 | 1555 | ||
1549 | #ifdef CONFIG_NUMA | 1556 | #ifdef CONFIG_NUMA |
1550 | #define MAX_NODE_LOAD (num_online_nodes()) | 1557 | #define MAX_NODE_LOAD (num_online_nodes()) |
1551 | static int __initdata node_load[MAX_NUMNODES]; | 1558 | static int __meminitdata node_load[MAX_NUMNODES]; |
1552 | /** | 1559 | /** |
1553 | * find_next_best_node - find the next node that should appear in a given node's fallback list | 1560 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
1554 | * @node: node whose fallback list we're appending | 1561 | * @node: node whose fallback list we're appending |
@@ -1563,7 +1570,7 @@ static int __initdata node_load[MAX_NUMNODES]; | |||
1563 | * on them otherwise. | 1570 | * on them otherwise. |
1564 | * It returns -1 if no node is found. | 1571 | * It returns -1 if no node is found. |
1565 | */ | 1572 | */ |
1566 | static int __init find_next_best_node(int node, nodemask_t *used_node_mask) | 1573 | static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) |
1567 | { | 1574 | { |
1568 | int n, val; | 1575 | int n, val; |
1569 | int min_val = INT_MAX; | 1576 | int min_val = INT_MAX; |
@@ -1609,7 +1616,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask) | |||
1609 | return best_node; | 1616 | return best_node; |
1610 | } | 1617 | } |
1611 | 1618 | ||
1612 | static void __init build_zonelists(pg_data_t *pgdat) | 1619 | static void __meminit build_zonelists(pg_data_t *pgdat) |
1613 | { | 1620 | { |
1614 | int i, j, k, node, local_node; | 1621 | int i, j, k, node, local_node; |
1615 | int prev_node, load; | 1622 | int prev_node, load; |
@@ -1661,7 +1668,7 @@ static void __init build_zonelists(pg_data_t *pgdat) | |||
1661 | 1668 | ||
1662 | #else /* CONFIG_NUMA */ | 1669 | #else /* CONFIG_NUMA */ |
1663 | 1670 | ||
1664 | static void __init build_zonelists(pg_data_t *pgdat) | 1671 | static void __meminit build_zonelists(pg_data_t *pgdat) |
1665 | { | 1672 | { |
1666 | int i, j, k, node, local_node; | 1673 | int i, j, k, node, local_node; |
1667 | 1674 | ||
@@ -1699,14 +1706,29 @@ static void __init build_zonelists(pg_data_t *pgdat) | |||
1699 | 1706 | ||
1700 | #endif /* CONFIG_NUMA */ | 1707 | #endif /* CONFIG_NUMA */ |
1701 | 1708 | ||
1702 | void __init build_all_zonelists(void) | 1709 | /* return values int ....just for stop_machine_run() */ |
1710 | static int __meminit __build_all_zonelists(void *dummy) | ||
1703 | { | 1711 | { |
1704 | int i; | 1712 | int nid; |
1713 | for_each_online_node(nid) | ||
1714 | build_zonelists(NODE_DATA(nid)); | ||
1715 | return 0; | ||
1716 | } | ||
1705 | 1717 | ||
1706 | for_each_online_node(i) | 1718 | void __meminit build_all_zonelists(void) |
1707 | build_zonelists(NODE_DATA(i)); | 1719 | { |
1708 | printk("Built %i zonelists\n", num_online_nodes()); | 1720 | if (system_state == SYSTEM_BOOTING) { |
1709 | cpuset_init_current_mems_allowed(); | 1721 | __build_all_zonelists(0); |
1722 | cpuset_init_current_mems_allowed(); | ||
1723 | } else { | ||
1724 | /* we have to stop all cpus to guaranntee there is no user | ||
1725 | of zonelist */ | ||
1726 | stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); | ||
1727 | /* cpuset refresh routine should be here */ | ||
1728 | } | ||
1729 | vm_total_pages = nr_free_pagecache_pages(); | ||
1730 | printk("Built %i zonelists. Total pages: %ld\n", | ||
1731 | num_online_nodes(), vm_total_pages); | ||
1710 | } | 1732 | } |
1711 | 1733 | ||
1712 | /* | 1734 | /* |
@@ -1722,7 +1744,8 @@ void __init build_all_zonelists(void) | |||
1722 | */ | 1744 | */ |
1723 | #define PAGES_PER_WAITQUEUE 256 | 1745 | #define PAGES_PER_WAITQUEUE 256 |
1724 | 1746 | ||
1725 | static inline unsigned long wait_table_size(unsigned long pages) | 1747 | #ifndef CONFIG_MEMORY_HOTPLUG |
1748 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | ||
1726 | { | 1749 | { |
1727 | unsigned long size = 1; | 1750 | unsigned long size = 1; |
1728 | 1751 | ||
@@ -1740,6 +1763,29 @@ static inline unsigned long wait_table_size(unsigned long pages) | |||
1740 | 1763 | ||
1741 | return max(size, 4UL); | 1764 | return max(size, 4UL); |
1742 | } | 1765 | } |
1766 | #else | ||
1767 | /* | ||
1768 | * A zone's size might be changed by hot-add, so it is not possible to determine | ||
1769 | * a suitable size for its wait_table. So we use the maximum size now. | ||
1770 | * | ||
1771 | * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: | ||
1772 | * | ||
1773 | * i386 (preemption config) : 4096 x 16 = 64Kbyte. | ||
1774 | * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. | ||
1775 | * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. | ||
1776 | * | ||
1777 | * The maximum entries are prepared when a zone's memory is (512K + 256) pages | ||
1778 | * or more by the traditional way. (See above). It equals: | ||
1779 | * | ||
1780 | * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. | ||
1781 | * ia64(16K page size) : = ( 8G + 4M)byte. | ||
1782 | * powerpc (64K page size) : = (32G +16M)byte. | ||
1783 | */ | ||
1784 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | ||
1785 | { | ||
1786 | return 4096UL; | ||
1787 | } | ||
1788 | #endif | ||
1743 | 1789 | ||
1744 | /* | 1790 | /* |
1745 | * This is an integer logarithm so that shifts can be used later | 1791 | * This is an integer logarithm so that shifts can be used later |
@@ -2005,23 +2051,46 @@ void __init setup_per_cpu_pageset(void) | |||
2005 | #endif | 2051 | #endif |
2006 | 2052 | ||
2007 | static __meminit | 2053 | static __meminit |
2008 | void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 2054 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
2009 | { | 2055 | { |
2010 | int i; | 2056 | int i; |
2011 | struct pglist_data *pgdat = zone->zone_pgdat; | 2057 | struct pglist_data *pgdat = zone->zone_pgdat; |
2058 | size_t alloc_size; | ||
2012 | 2059 | ||
2013 | /* | 2060 | /* |
2014 | * The per-page waitqueue mechanism uses hashed waitqueues | 2061 | * The per-page waitqueue mechanism uses hashed waitqueues |
2015 | * per zone. | 2062 | * per zone. |
2016 | */ | 2063 | */ |
2017 | zone->wait_table_size = wait_table_size(zone_size_pages); | 2064 | zone->wait_table_hash_nr_entries = |
2018 | zone->wait_table_bits = wait_table_bits(zone->wait_table_size); | 2065 | wait_table_hash_nr_entries(zone_size_pages); |
2019 | zone->wait_table = (wait_queue_head_t *) | 2066 | zone->wait_table_bits = |
2020 | alloc_bootmem_node(pgdat, zone->wait_table_size | 2067 | wait_table_bits(zone->wait_table_hash_nr_entries); |
2021 | * sizeof(wait_queue_head_t)); | 2068 | alloc_size = zone->wait_table_hash_nr_entries |
2069 | * sizeof(wait_queue_head_t); | ||
2070 | |||
2071 | if (system_state == SYSTEM_BOOTING) { | ||
2072 | zone->wait_table = (wait_queue_head_t *) | ||
2073 | alloc_bootmem_node(pgdat, alloc_size); | ||
2074 | } else { | ||
2075 | /* | ||
2076 | * This case means that a zone whose size was 0 gets new memory | ||
2077 | * via memory hot-add. | ||
2078 | * But it may be the case that a new node was hot-added. In | ||
2079 | * this case vmalloc() will not be able to use this new node's | ||
2080 | * memory - this wait_table must be initialized to use this new | ||
2081 | * node itself as well. | ||
2082 | * To use this new node's memory, further consideration will be | ||
2083 | * necessary. | ||
2084 | */ | ||
2085 | zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); | ||
2086 | } | ||
2087 | if (!zone->wait_table) | ||
2088 | return -ENOMEM; | ||
2022 | 2089 | ||
2023 | for(i = 0; i < zone->wait_table_size; ++i) | 2090 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) |
2024 | init_waitqueue_head(zone->wait_table + i); | 2091 | init_waitqueue_head(zone->wait_table + i); |
2092 | |||
2093 | return 0; | ||
2025 | } | 2094 | } |
2026 | 2095 | ||
2027 | static __meminit void zone_pcp_init(struct zone *zone) | 2096 | static __meminit void zone_pcp_init(struct zone *zone) |
@@ -2043,12 +2112,15 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
2043 | zone->name, zone->present_pages, batch); | 2112 | zone->name, zone->present_pages, batch); |
2044 | } | 2113 | } |
2045 | 2114 | ||
2046 | static __meminit void init_currently_empty_zone(struct zone *zone, | 2115 | __meminit int init_currently_empty_zone(struct zone *zone, |
2047 | unsigned long zone_start_pfn, unsigned long size) | 2116 | unsigned long zone_start_pfn, |
2117 | unsigned long size) | ||
2048 | { | 2118 | { |
2049 | struct pglist_data *pgdat = zone->zone_pgdat; | 2119 | struct pglist_data *pgdat = zone->zone_pgdat; |
2050 | 2120 | int ret; | |
2051 | zone_wait_table_init(zone, size); | 2121 | ret = zone_wait_table_init(zone, size); |
2122 | if (ret) | ||
2123 | return ret; | ||
2052 | pgdat->nr_zones = zone_idx(zone) + 1; | 2124 | pgdat->nr_zones = zone_idx(zone) + 1; |
2053 | 2125 | ||
2054 | zone->zone_start_pfn = zone_start_pfn; | 2126 | zone->zone_start_pfn = zone_start_pfn; |
@@ -2056,6 +2128,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone, | |||
2056 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); | 2128 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); |
2057 | 2129 | ||
2058 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); | 2130 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); |
2131 | |||
2132 | return 0; | ||
2059 | } | 2133 | } |
2060 | 2134 | ||
2061 | /* | 2135 | /* |
@@ -2064,12 +2138,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone, | |||
2064 | * - mark all memory queues empty | 2138 | * - mark all memory queues empty |
2065 | * - clear the memory bitmaps | 2139 | * - clear the memory bitmaps |
2066 | */ | 2140 | */ |
2067 | static void __init free_area_init_core(struct pglist_data *pgdat, | 2141 | static void __meminit free_area_init_core(struct pglist_data *pgdat, |
2068 | unsigned long *zones_size, unsigned long *zholes_size) | 2142 | unsigned long *zones_size, unsigned long *zholes_size) |
2069 | { | 2143 | { |
2070 | unsigned long j; | 2144 | unsigned long j; |
2071 | int nid = pgdat->node_id; | 2145 | int nid = pgdat->node_id; |
2072 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 2146 | unsigned long zone_start_pfn = pgdat->node_start_pfn; |
2147 | int ret; | ||
2073 | 2148 | ||
2074 | pgdat_resize_init(pgdat); | 2149 | pgdat_resize_init(pgdat); |
2075 | pgdat->nr_zones = 0; | 2150 | pgdat->nr_zones = 0; |
@@ -2111,7 +2186,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
2111 | continue; | 2186 | continue; |
2112 | 2187 | ||
2113 | zonetable_add(zone, nid, j, zone_start_pfn, size); | 2188 | zonetable_add(zone, nid, j, zone_start_pfn, size); |
2114 | init_currently_empty_zone(zone, zone_start_pfn, size); | 2189 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); |
2190 | BUG_ON(ret); | ||
2115 | zone_start_pfn += size; | 2191 | zone_start_pfn += size; |
2116 | } | 2192 | } |
2117 | } | 2193 | } |
@@ -2152,7 +2228,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) | |||
2152 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 2228 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
2153 | } | 2229 | } |
2154 | 2230 | ||
2155 | void __init free_area_init_node(int nid, struct pglist_data *pgdat, | 2231 | void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, |
2156 | unsigned long *zones_size, unsigned long node_start_pfn, | 2232 | unsigned long *zones_size, unsigned long node_start_pfn, |
2157 | unsigned long *zholes_size) | 2233 | unsigned long *zholes_size) |
2158 | { | 2234 | { |
@@ -2804,42 +2880,14 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2804 | } | 2880 | } |
2805 | 2881 | ||
2806 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE | 2882 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE |
2807 | /* | ||
2808 | * pfn <-> page translation. out-of-line version. | ||
2809 | * (see asm-generic/memory_model.h) | ||
2810 | */ | ||
2811 | #if defined(CONFIG_FLATMEM) | ||
2812 | struct page *pfn_to_page(unsigned long pfn) | 2883 | struct page *pfn_to_page(unsigned long pfn) |
2813 | { | 2884 | { |
2814 | return mem_map + (pfn - ARCH_PFN_OFFSET); | 2885 | return __pfn_to_page(pfn); |
2815 | } | 2886 | } |
2816 | unsigned long page_to_pfn(struct page *page) | 2887 | unsigned long page_to_pfn(struct page *page) |
2817 | { | 2888 | { |
2818 | return (page - mem_map) + ARCH_PFN_OFFSET; | 2889 | return __page_to_pfn(page); |
2819 | } | ||
2820 | #elif defined(CONFIG_DISCONTIGMEM) | ||
2821 | struct page *pfn_to_page(unsigned long pfn) | ||
2822 | { | ||
2823 | int nid = arch_pfn_to_nid(pfn); | ||
2824 | return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); | ||
2825 | } | ||
2826 | unsigned long page_to_pfn(struct page *page) | ||
2827 | { | ||
2828 | struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); | ||
2829 | return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; | ||
2830 | } | ||
2831 | #elif defined(CONFIG_SPARSEMEM) | ||
2832 | struct page *pfn_to_page(unsigned long pfn) | ||
2833 | { | ||
2834 | return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; | ||
2835 | } | ||
2836 | |||
2837 | unsigned long page_to_pfn(struct page *page) | ||
2838 | { | ||
2839 | long section_id = page_to_section(page); | ||
2840 | return page - __section_mem_map_addr(__nr_to_section(section_id)); | ||
2841 | } | 2890 | } |
2842 | #endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */ | ||
2843 | EXPORT_SYMBOL(pfn_to_page); | 2891 | EXPORT_SYMBOL(pfn_to_page); |
2844 | EXPORT_SYMBOL(page_to_pfn); | 2892 | EXPORT_SYMBOL(page_to_pfn); |
2845 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 2893 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
diff --git a/mm/pdflush.c b/mm/pdflush.c index c4b6d0afd736..df7e50b8f70c 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
@@ -202,8 +202,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) | |||
202 | unsigned long flags; | 202 | unsigned long flags; |
203 | int ret = 0; | 203 | int ret = 0; |
204 | 204 | ||
205 | if (fn == NULL) | 205 | BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ |
206 | BUG(); /* Hard to diagnose if it's deferred */ | ||
207 | 206 | ||
208 | spin_lock_irqsave(&pdflush_lock, flags); | 207 | spin_lock_irqsave(&pdflush_lock, flags); |
209 | if (list_empty(&pdflush_list)) { | 208 | if (list_empty(&pdflush_list)) { |
@@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
103 | spin_lock(&mm->page_table_lock); | 103 | spin_lock(&mm->page_table_lock); |
104 | if (likely(!vma->anon_vma)) { | 104 | if (likely(!vma->anon_vma)) { |
105 | vma->anon_vma = anon_vma; | 105 | vma->anon_vma = anon_vma; |
106 | list_add(&vma->anon_vma_node, &anon_vma->head); | 106 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
107 | allocated = NULL; | 107 | allocated = NULL; |
108 | } | 108 | } |
109 | spin_unlock(&mm->page_table_lock); | 109 | spin_unlock(&mm->page_table_lock); |
@@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma) | |||
127 | struct anon_vma *anon_vma = vma->anon_vma; | 127 | struct anon_vma *anon_vma = vma->anon_vma; |
128 | 128 | ||
129 | if (anon_vma) { | 129 | if (anon_vma) { |
130 | list_add(&vma->anon_vma_node, &anon_vma->head); | 130 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
131 | validate_anon_vma(vma); | 131 | validate_anon_vma(vma); |
132 | } | 132 | } |
133 | } | 133 | } |
@@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma) | |||
138 | 138 | ||
139 | if (anon_vma) { | 139 | if (anon_vma) { |
140 | spin_lock(&anon_vma->lock); | 140 | spin_lock(&anon_vma->lock); |
141 | list_add(&vma->anon_vma_node, &anon_vma->head); | 141 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
142 | validate_anon_vma(vma); | 142 | validate_anon_vma(vma); |
143 | spin_unlock(&anon_vma->lock); | 143 | spin_unlock(&anon_vma->lock); |
144 | } | 144 | } |
@@ -205,44 +205,6 @@ out: | |||
205 | return anon_vma; | 205 | return anon_vma; |
206 | } | 206 | } |
207 | 207 | ||
208 | #ifdef CONFIG_MIGRATION | ||
209 | /* | ||
210 | * Remove an anonymous page from swap replacing the swap pte's | ||
211 | * through real pte's pointing to valid pages and then releasing | ||
212 | * the page from the swap cache. | ||
213 | * | ||
214 | * Must hold page lock on page and mmap_sem of one vma that contains | ||
215 | * the page. | ||
216 | */ | ||
217 | void remove_from_swap(struct page *page) | ||
218 | { | ||
219 | struct anon_vma *anon_vma; | ||
220 | struct vm_area_struct *vma; | ||
221 | unsigned long mapping; | ||
222 | |||
223 | if (!PageSwapCache(page)) | ||
224 | return; | ||
225 | |||
226 | mapping = (unsigned long)page->mapping; | ||
227 | |||
228 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) | ||
229 | return; | ||
230 | |||
231 | /* | ||
232 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. | ||
233 | */ | ||
234 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); | ||
235 | spin_lock(&anon_vma->lock); | ||
236 | |||
237 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | ||
238 | remove_vma_swap(vma, page); | ||
239 | |||
240 | spin_unlock(&anon_vma->lock); | ||
241 | delete_from_swap_cache(page); | ||
242 | } | ||
243 | EXPORT_SYMBOL(remove_from_swap); | ||
244 | #endif | ||
245 | |||
246 | /* | 208 | /* |
247 | * At what user virtual address is page expected in vma? | 209 | * At what user virtual address is page expected in vma? |
248 | */ | 210 | */ |
@@ -578,7 +540,7 @@ void page_remove_rmap(struct page *page) | |||
578 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 540 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
579 | */ | 541 | */ |
580 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 542 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
581 | int ignore_refs) | 543 | int migration) |
582 | { | 544 | { |
583 | struct mm_struct *mm = vma->vm_mm; | 545 | struct mm_struct *mm = vma->vm_mm; |
584 | unsigned long address; | 546 | unsigned long address; |
@@ -602,7 +564,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
602 | */ | 564 | */ |
603 | if ((vma->vm_flags & VM_LOCKED) || | 565 | if ((vma->vm_flags & VM_LOCKED) || |
604 | (ptep_clear_flush_young(vma, address, pte) | 566 | (ptep_clear_flush_young(vma, address, pte) |
605 | && !ignore_refs)) { | 567 | && !migration)) { |
606 | ret = SWAP_FAIL; | 568 | ret = SWAP_FAIL; |
607 | goto out_unmap; | 569 | goto out_unmap; |
608 | } | 570 | } |
@@ -620,24 +582,45 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
620 | 582 | ||
621 | if (PageAnon(page)) { | 583 | if (PageAnon(page)) { |
622 | swp_entry_t entry = { .val = page_private(page) }; | 584 | swp_entry_t entry = { .val = page_private(page) }; |
623 | /* | 585 | |
624 | * Store the swap location in the pte. | 586 | if (PageSwapCache(page)) { |
625 | * See handle_pte_fault() ... | 587 | /* |
626 | */ | 588 | * Store the swap location in the pte. |
627 | BUG_ON(!PageSwapCache(page)); | 589 | * See handle_pte_fault() ... |
628 | swap_duplicate(entry); | 590 | */ |
629 | if (list_empty(&mm->mmlist)) { | 591 | swap_duplicate(entry); |
630 | spin_lock(&mmlist_lock); | 592 | if (list_empty(&mm->mmlist)) { |
631 | if (list_empty(&mm->mmlist)) | 593 | spin_lock(&mmlist_lock); |
632 | list_add(&mm->mmlist, &init_mm.mmlist); | 594 | if (list_empty(&mm->mmlist)) |
633 | spin_unlock(&mmlist_lock); | 595 | list_add(&mm->mmlist, &init_mm.mmlist); |
596 | spin_unlock(&mmlist_lock); | ||
597 | } | ||
598 | dec_mm_counter(mm, anon_rss); | ||
599 | #ifdef CONFIG_MIGRATION | ||
600 | } else { | ||
601 | /* | ||
602 | * Store the pfn of the page in a special migration | ||
603 | * pte. do_swap_page() will wait until the migration | ||
604 | * pte is removed and then restart fault handling. | ||
605 | */ | ||
606 | BUG_ON(!migration); | ||
607 | entry = make_migration_entry(page, pte_write(pteval)); | ||
608 | #endif | ||
634 | } | 609 | } |
635 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 610 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
636 | BUG_ON(pte_file(*pte)); | 611 | BUG_ON(pte_file(*pte)); |
637 | dec_mm_counter(mm, anon_rss); | ||
638 | } else | 612 | } else |
613 | #ifdef CONFIG_MIGRATION | ||
614 | if (migration) { | ||
615 | /* Establish migration entry for a file page */ | ||
616 | swp_entry_t entry; | ||
617 | entry = make_migration_entry(page, pte_write(pteval)); | ||
618 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | ||
619 | } else | ||
620 | #endif | ||
639 | dec_mm_counter(mm, file_rss); | 621 | dec_mm_counter(mm, file_rss); |
640 | 622 | ||
623 | |||
641 | page_remove_rmap(page); | 624 | page_remove_rmap(page); |
642 | page_cache_release(page); | 625 | page_cache_release(page); |
643 | 626 | ||
@@ -736,7 +719,7 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
736 | pte_unmap_unlock(pte - 1, ptl); | 719 | pte_unmap_unlock(pte - 1, ptl); |
737 | } | 720 | } |
738 | 721 | ||
739 | static int try_to_unmap_anon(struct page *page, int ignore_refs) | 722 | static int try_to_unmap_anon(struct page *page, int migration) |
740 | { | 723 | { |
741 | struct anon_vma *anon_vma; | 724 | struct anon_vma *anon_vma; |
742 | struct vm_area_struct *vma; | 725 | struct vm_area_struct *vma; |
@@ -747,7 +730,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs) | |||
747 | return ret; | 730 | return ret; |
748 | 731 | ||
749 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 732 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
750 | ret = try_to_unmap_one(page, vma, ignore_refs); | 733 | ret = try_to_unmap_one(page, vma, migration); |
751 | if (ret == SWAP_FAIL || !page_mapped(page)) | 734 | if (ret == SWAP_FAIL || !page_mapped(page)) |
752 | break; | 735 | break; |
753 | } | 736 | } |
@@ -764,7 +747,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs) | |||
764 | * | 747 | * |
765 | * This function is only called from try_to_unmap for object-based pages. | 748 | * This function is only called from try_to_unmap for object-based pages. |
766 | */ | 749 | */ |
767 | static int try_to_unmap_file(struct page *page, int ignore_refs) | 750 | static int try_to_unmap_file(struct page *page, int migration) |
768 | { | 751 | { |
769 | struct address_space *mapping = page->mapping; | 752 | struct address_space *mapping = page->mapping; |
770 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 753 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
@@ -778,7 +761,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs) | |||
778 | 761 | ||
779 | spin_lock(&mapping->i_mmap_lock); | 762 | spin_lock(&mapping->i_mmap_lock); |
780 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 763 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
781 | ret = try_to_unmap_one(page, vma, ignore_refs); | 764 | ret = try_to_unmap_one(page, vma, migration); |
782 | if (ret == SWAP_FAIL || !page_mapped(page)) | 765 | if (ret == SWAP_FAIL || !page_mapped(page)) |
783 | goto out; | 766 | goto out; |
784 | } | 767 | } |
@@ -863,16 +846,16 @@ out: | |||
863 | * SWAP_AGAIN - we missed a mapping, try again later | 846 | * SWAP_AGAIN - we missed a mapping, try again later |
864 | * SWAP_FAIL - the page is unswappable | 847 | * SWAP_FAIL - the page is unswappable |
865 | */ | 848 | */ |
866 | int try_to_unmap(struct page *page, int ignore_refs) | 849 | int try_to_unmap(struct page *page, int migration) |
867 | { | 850 | { |
868 | int ret; | 851 | int ret; |
869 | 852 | ||
870 | BUG_ON(!PageLocked(page)); | 853 | BUG_ON(!PageLocked(page)); |
871 | 854 | ||
872 | if (PageAnon(page)) | 855 | if (PageAnon(page)) |
873 | ret = try_to_unmap_anon(page, ignore_refs); | 856 | ret = try_to_unmap_anon(page, migration); |
874 | else | 857 | else |
875 | ret = try_to_unmap_file(page, ignore_refs); | 858 | ret = try_to_unmap_file(page, migration); |
876 | 859 | ||
877 | if (!page_mapped(page)) | 860 | if (!page_mapped(page)) |
878 | ret = SWAP_SUCCESS; | 861 | ret = SWAP_SUCCESS; |
diff --git a/mm/shmem.c b/mm/shmem.c index 797eef3805ce..38bc3334f263 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1081,14 +1081,6 @@ repeat: | |||
1081 | page_cache_release(swappage); | 1081 | page_cache_release(swappage); |
1082 | goto repeat; | 1082 | goto repeat; |
1083 | } | 1083 | } |
1084 | if (!PageSwapCache(swappage)) { | ||
1085 | /* Page migration has occured */ | ||
1086 | shmem_swp_unmap(entry); | ||
1087 | spin_unlock(&info->lock); | ||
1088 | unlock_page(swappage); | ||
1089 | page_cache_release(swappage); | ||
1090 | goto repeat; | ||
1091 | } | ||
1092 | if (PageWriteback(swappage)) { | 1084 | if (PageWriteback(swappage)) { |
1093 | shmem_swp_unmap(entry); | 1085 | shmem_swp_unmap(entry); |
1094 | spin_unlock(&info->lock); | 1086 | spin_unlock(&info->lock); |
@@ -1654,9 +1646,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos, | |||
1654 | return desc.error; | 1646 | return desc.error; |
1655 | } | 1647 | } |
1656 | 1648 | ||
1657 | static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) | 1649 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1658 | { | 1650 | { |
1659 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 1651 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
1660 | 1652 | ||
1661 | buf->f_type = TMPFS_MAGIC; | 1653 | buf->f_type = TMPFS_MAGIC; |
1662 | buf->f_bsize = PAGE_CACHE_SIZE; | 1654 | buf->f_bsize = PAGE_CACHE_SIZE; |
@@ -2233,10 +2225,10 @@ static struct vm_operations_struct shmem_vm_ops = { | |||
2233 | }; | 2225 | }; |
2234 | 2226 | ||
2235 | 2227 | ||
2236 | static struct super_block *shmem_get_sb(struct file_system_type *fs_type, | 2228 | static int shmem_get_sb(struct file_system_type *fs_type, |
2237 | int flags, const char *dev_name, void *data) | 2229 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) |
2238 | { | 2230 | { |
2239 | return get_sb_nodev(fs_type, flags, data, shmem_fill_super); | 2231 | return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); |
2240 | } | 2232 | } |
2241 | 2233 | ||
2242 | static struct file_system_type tmpfs_fs_type = { | 2234 | static struct file_system_type tmpfs_fs_type = { |
@@ -331,6 +331,8 @@ static __always_inline int index_of(const size_t size) | |||
331 | return 0; | 331 | return 0; |
332 | } | 332 | } |
333 | 333 | ||
334 | static int slab_early_init = 1; | ||
335 | |||
334 | #define INDEX_AC index_of(sizeof(struct arraycache_init)) | 336 | #define INDEX_AC index_of(sizeof(struct arraycache_init)) |
335 | #define INDEX_L3 index_of(sizeof(struct kmem_list3)) | 337 | #define INDEX_L3 index_of(sizeof(struct kmem_list3)) |
336 | 338 | ||
@@ -592,6 +594,7 @@ static inline struct kmem_cache *page_get_cache(struct page *page) | |||
592 | { | 594 | { |
593 | if (unlikely(PageCompound(page))) | 595 | if (unlikely(PageCompound(page))) |
594 | page = (struct page *)page_private(page); | 596 | page = (struct page *)page_private(page); |
597 | BUG_ON(!PageSlab(page)); | ||
595 | return (struct kmem_cache *)page->lru.next; | 598 | return (struct kmem_cache *)page->lru.next; |
596 | } | 599 | } |
597 | 600 | ||
@@ -604,6 +607,7 @@ static inline struct slab *page_get_slab(struct page *page) | |||
604 | { | 607 | { |
605 | if (unlikely(PageCompound(page))) | 608 | if (unlikely(PageCompound(page))) |
606 | page = (struct page *)page_private(page); | 609 | page = (struct page *)page_private(page); |
610 | BUG_ON(!PageSlab(page)); | ||
607 | return (struct slab *)page->lru.prev; | 611 | return (struct slab *)page->lru.prev; |
608 | } | 612 | } |
609 | 613 | ||
@@ -1024,6 +1028,40 @@ static void drain_alien_cache(struct kmem_cache *cachep, | |||
1024 | } | 1028 | } |
1025 | } | 1029 | } |
1026 | } | 1030 | } |
1031 | |||
1032 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
1033 | { | ||
1034 | struct slab *slabp = virt_to_slab(objp); | ||
1035 | int nodeid = slabp->nodeid; | ||
1036 | struct kmem_list3 *l3; | ||
1037 | struct array_cache *alien = NULL; | ||
1038 | |||
1039 | /* | ||
1040 | * Make sure we are not freeing an object from another node to the array | ||
1041 | * cache on this cpu. | ||
1042 | */ | ||
1043 | if (likely(slabp->nodeid == numa_node_id())) | ||
1044 | return 0; | ||
1045 | |||
1046 | l3 = cachep->nodelists[numa_node_id()]; | ||
1047 | STATS_INC_NODEFREES(cachep); | ||
1048 | if (l3->alien && l3->alien[nodeid]) { | ||
1049 | alien = l3->alien[nodeid]; | ||
1050 | spin_lock(&alien->lock); | ||
1051 | if (unlikely(alien->avail == alien->limit)) { | ||
1052 | STATS_INC_ACOVERFLOW(cachep); | ||
1053 | __drain_alien_cache(cachep, alien, nodeid); | ||
1054 | } | ||
1055 | alien->entry[alien->avail++] = objp; | ||
1056 | spin_unlock(&alien->lock); | ||
1057 | } else { | ||
1058 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); | ||
1059 | free_block(cachep, &objp, 1, nodeid); | ||
1060 | spin_unlock(&(cachep->nodelists[nodeid])->list_lock); | ||
1061 | } | ||
1062 | return 1; | ||
1063 | } | ||
1064 | |||
1027 | #else | 1065 | #else |
1028 | 1066 | ||
1029 | #define drain_alien_cache(cachep, alien) do { } while (0) | 1067 | #define drain_alien_cache(cachep, alien) do { } while (0) |
@@ -1038,6 +1076,11 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) | |||
1038 | { | 1076 | { |
1039 | } | 1077 | } |
1040 | 1078 | ||
1079 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
1080 | { | ||
1081 | return 0; | ||
1082 | } | ||
1083 | |||
1041 | #endif | 1084 | #endif |
1042 | 1085 | ||
1043 | static int cpuup_callback(struct notifier_block *nfb, | 1086 | static int cpuup_callback(struct notifier_block *nfb, |
@@ -1335,6 +1378,8 @@ void __init kmem_cache_init(void) | |||
1335 | NULL, NULL); | 1378 | NULL, NULL); |
1336 | } | 1379 | } |
1337 | 1380 | ||
1381 | slab_early_init = 0; | ||
1382 | |||
1338 | while (sizes->cs_size != ULONG_MAX) { | 1383 | while (sizes->cs_size != ULONG_MAX) { |
1339 | /* | 1384 | /* |
1340 | * For performance, all the general caches are L1 aligned. | 1385 | * For performance, all the general caches are L1 aligned. |
@@ -1450,31 +1495,29 @@ __initcall(cpucache_init); | |||
1450 | static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 1495 | static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
1451 | { | 1496 | { |
1452 | struct page *page; | 1497 | struct page *page; |
1453 | void *addr; | 1498 | int nr_pages; |
1454 | int i; | 1499 | int i; |
1455 | 1500 | ||
1456 | flags |= cachep->gfpflags; | ||
1457 | #ifndef CONFIG_MMU | 1501 | #ifndef CONFIG_MMU |
1458 | /* nommu uses slab's for process anonymous memory allocations, so | 1502 | /* |
1459 | * requires __GFP_COMP to properly refcount higher order allocations" | 1503 | * Nommu uses slabs for process anonymous memory allocations, and thus |
1504 | * requires __GFP_COMP to properly refcount higher order allocations | ||
1460 | */ | 1505 | */ |
1461 | page = alloc_pages_node(nodeid, (flags | __GFP_COMP), cachep->gfporder); | 1506 | flags |= __GFP_COMP; |
1462 | #else | ||
1463 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | ||
1464 | #endif | 1507 | #endif |
1508 | flags |= cachep->gfpflags; | ||
1509 | |||
1510 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | ||
1465 | if (!page) | 1511 | if (!page) |
1466 | return NULL; | 1512 | return NULL; |
1467 | addr = page_address(page); | ||
1468 | 1513 | ||
1469 | i = (1 << cachep->gfporder); | 1514 | nr_pages = (1 << cachep->gfporder); |
1470 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1515 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1471 | atomic_add(i, &slab_reclaim_pages); | 1516 | atomic_add(nr_pages, &slab_reclaim_pages); |
1472 | add_page_state(nr_slab, i); | 1517 | add_page_state(nr_slab, nr_pages); |
1473 | while (i--) { | 1518 | for (i = 0; i < nr_pages; i++) |
1474 | __SetPageSlab(page); | 1519 | __SetPageSlab(page + i); |
1475 | page++; | 1520 | return page_address(page); |
1476 | } | ||
1477 | return addr; | ||
1478 | } | 1521 | } |
1479 | 1522 | ||
1480 | /* | 1523 | /* |
@@ -1913,8 +1956,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1913 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) | 1956 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) |
1914 | { | 1957 | { |
1915 | size_t left_over, slab_size, ralign; | 1958 | size_t left_over, slab_size, ralign; |
1916 | struct kmem_cache *cachep = NULL; | 1959 | struct kmem_cache *cachep = NULL, *pc; |
1917 | struct list_head *p; | ||
1918 | 1960 | ||
1919 | /* | 1961 | /* |
1920 | * Sanity checks... these are all serious usage bugs. | 1962 | * Sanity checks... these are all serious usage bugs. |
@@ -1934,8 +1976,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1934 | 1976 | ||
1935 | mutex_lock(&cache_chain_mutex); | 1977 | mutex_lock(&cache_chain_mutex); |
1936 | 1978 | ||
1937 | list_for_each(p, &cache_chain) { | 1979 | list_for_each_entry(pc, &cache_chain, next) { |
1938 | struct kmem_cache *pc = list_entry(p, struct kmem_cache, next); | ||
1939 | mm_segment_t old_fs = get_fs(); | 1980 | mm_segment_t old_fs = get_fs(); |
1940 | char tmp; | 1981 | char tmp; |
1941 | int res; | 1982 | int res; |
@@ -2069,8 +2110,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2069 | #endif | 2110 | #endif |
2070 | #endif | 2111 | #endif |
2071 | 2112 | ||
2072 | /* Determine if the slab management is 'on' or 'off' slab. */ | 2113 | /* |
2073 | if (size >= (PAGE_SIZE >> 3)) | 2114 | * Determine if the slab management is 'on' or 'off' slab. |
2115 | * (bootstrapping cannot cope with offslab caches so don't do | ||
2116 | * it too early on.) | ||
2117 | */ | ||
2118 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) | ||
2074 | /* | 2119 | /* |
2075 | * Size is large, assume best to place the slab management obj | 2120 | * Size is large, assume best to place the slab management obj |
2076 | * off-slab (should allow better packing of objs). | 2121 | * off-slab (should allow better packing of objs). |
@@ -2460,23 +2505,28 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, | |||
2460 | slabp->inuse--; | 2505 | slabp->inuse--; |
2461 | } | 2506 | } |
2462 | 2507 | ||
2463 | static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, | 2508 | /* |
2464 | void *objp) | 2509 | * Map pages beginning at addr to the given cache and slab. This is required |
2510 | * for the slab allocator to be able to lookup the cache and slab of a | ||
2511 | * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. | ||
2512 | */ | ||
2513 | static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | ||
2514 | void *addr) | ||
2465 | { | 2515 | { |
2466 | int i; | 2516 | int nr_pages; |
2467 | struct page *page; | 2517 | struct page *page; |
2468 | 2518 | ||
2469 | /* Nasty!!!!!! I hope this is OK. */ | 2519 | page = virt_to_page(addr); |
2470 | page = virt_to_page(objp); | ||
2471 | 2520 | ||
2472 | i = 1; | 2521 | nr_pages = 1; |
2473 | if (likely(!PageCompound(page))) | 2522 | if (likely(!PageCompound(page))) |
2474 | i <<= cachep->gfporder; | 2523 | nr_pages <<= cache->gfporder; |
2524 | |||
2475 | do { | 2525 | do { |
2476 | page_set_cache(page, cachep); | 2526 | page_set_cache(page, cache); |
2477 | page_set_slab(page, slabp); | 2527 | page_set_slab(page, slab); |
2478 | page++; | 2528 | page++; |
2479 | } while (--i); | 2529 | } while (--nr_pages); |
2480 | } | 2530 | } |
2481 | 2531 | ||
2482 | /* | 2532 | /* |
@@ -2548,7 +2598,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2548 | goto opps1; | 2598 | goto opps1; |
2549 | 2599 | ||
2550 | slabp->nodeid = nodeid; | 2600 | slabp->nodeid = nodeid; |
2551 | set_slab_attr(cachep, slabp, objp); | 2601 | slab_map_pages(cachep, slabp, objp); |
2552 | 2602 | ||
2553 | cache_init_objs(cachep, slabp, ctor_flags); | 2603 | cache_init_objs(cachep, slabp, ctor_flags); |
2554 | 2604 | ||
@@ -2596,6 +2646,28 @@ static void kfree_debugcheck(const void *objp) | |||
2596 | } | 2646 | } |
2597 | } | 2647 | } |
2598 | 2648 | ||
2649 | static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) | ||
2650 | { | ||
2651 | unsigned long redzone1, redzone2; | ||
2652 | |||
2653 | redzone1 = *dbg_redzone1(cache, obj); | ||
2654 | redzone2 = *dbg_redzone2(cache, obj); | ||
2655 | |||
2656 | /* | ||
2657 | * Redzone is ok. | ||
2658 | */ | ||
2659 | if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) | ||
2660 | return; | ||
2661 | |||
2662 | if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) | ||
2663 | slab_error(cache, "double free detected"); | ||
2664 | else | ||
2665 | slab_error(cache, "memory outside object was overwritten"); | ||
2666 | |||
2667 | printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", | ||
2668 | obj, redzone1, redzone2); | ||
2669 | } | ||
2670 | |||
2599 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | 2671 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, |
2600 | void *caller) | 2672 | void *caller) |
2601 | { | 2673 | { |
@@ -2607,27 +2679,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2607 | kfree_debugcheck(objp); | 2679 | kfree_debugcheck(objp); |
2608 | page = virt_to_page(objp); | 2680 | page = virt_to_page(objp); |
2609 | 2681 | ||
2610 | if (page_get_cache(page) != cachep) { | ||
2611 | printk(KERN_ERR "mismatch in kmem_cache_free: expected " | ||
2612 | "cache %p, got %p\n", | ||
2613 | page_get_cache(page), cachep); | ||
2614 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); | ||
2615 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), | ||
2616 | page_get_cache(page)->name); | ||
2617 | WARN_ON(1); | ||
2618 | } | ||
2619 | slabp = page_get_slab(page); | 2682 | slabp = page_get_slab(page); |
2620 | 2683 | ||
2621 | if (cachep->flags & SLAB_RED_ZONE) { | 2684 | if (cachep->flags & SLAB_RED_ZONE) { |
2622 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || | 2685 | verify_redzone_free(cachep, objp); |
2623 | *dbg_redzone2(cachep, objp) != RED_ACTIVE) { | ||
2624 | slab_error(cachep, "double free, or memory outside" | ||
2625 | " object was overwritten"); | ||
2626 | printk(KERN_ERR "%p: redzone 1:0x%lx, " | ||
2627 | "redzone 2:0x%lx.\n", | ||
2628 | objp, *dbg_redzone1(cachep, objp), | ||
2629 | *dbg_redzone2(cachep, objp)); | ||
2630 | } | ||
2631 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; | 2686 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; |
2632 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 2687 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
2633 | } | 2688 | } |
@@ -3087,41 +3142,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3087 | check_irq_off(); | 3142 | check_irq_off(); |
3088 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | 3143 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); |
3089 | 3144 | ||
3090 | /* Make sure we are not freeing a object from another | 3145 | if (cache_free_alien(cachep, objp)) |
3091 | * node to the array cache on this cpu. | 3146 | return; |
3092 | */ | 3147 | |
3093 | #ifdef CONFIG_NUMA | ||
3094 | { | ||
3095 | struct slab *slabp; | ||
3096 | slabp = virt_to_slab(objp); | ||
3097 | if (unlikely(slabp->nodeid != numa_node_id())) { | ||
3098 | struct array_cache *alien = NULL; | ||
3099 | int nodeid = slabp->nodeid; | ||
3100 | struct kmem_list3 *l3; | ||
3101 | |||
3102 | l3 = cachep->nodelists[numa_node_id()]; | ||
3103 | STATS_INC_NODEFREES(cachep); | ||
3104 | if (l3->alien && l3->alien[nodeid]) { | ||
3105 | alien = l3->alien[nodeid]; | ||
3106 | spin_lock(&alien->lock); | ||
3107 | if (unlikely(alien->avail == alien->limit)) { | ||
3108 | STATS_INC_ACOVERFLOW(cachep); | ||
3109 | __drain_alien_cache(cachep, | ||
3110 | alien, nodeid); | ||
3111 | } | ||
3112 | alien->entry[alien->avail++] = objp; | ||
3113 | spin_unlock(&alien->lock); | ||
3114 | } else { | ||
3115 | spin_lock(&(cachep->nodelists[nodeid])-> | ||
3116 | list_lock); | ||
3117 | free_block(cachep, &objp, 1, nodeid); | ||
3118 | spin_unlock(&(cachep->nodelists[nodeid])-> | ||
3119 | list_lock); | ||
3120 | } | ||
3121 | return; | ||
3122 | } | ||
3123 | } | ||
3124 | #endif | ||
3125 | if (likely(ac->avail < ac->limit)) { | 3148 | if (likely(ac->avail < ac->limit)) { |
3126 | STATS_INC_FREEHIT(cachep); | 3149 | STATS_INC_FREEHIT(cachep); |
3127 | ac->entry[ac->avail++] = objp; | 3150 | ac->entry[ac->avail++] = objp; |
@@ -3254,26 +3277,10 @@ EXPORT_SYMBOL(kmalloc_node); | |||
3254 | #endif | 3277 | #endif |
3255 | 3278 | ||
3256 | /** | 3279 | /** |
3257 | * kmalloc - allocate memory | 3280 | * __do_kmalloc - allocate memory |
3258 | * @size: how many bytes of memory are required. | 3281 | * @size: how many bytes of memory are required. |
3259 | * @flags: the type of memory to allocate. | 3282 | * @flags: the type of memory to allocate (see kmalloc). |
3260 | * @caller: function caller for debug tracking of the caller | 3283 | * @caller: function caller for debug tracking of the caller |
3261 | * | ||
3262 | * kmalloc is the normal method of allocating memory | ||
3263 | * in the kernel. | ||
3264 | * | ||
3265 | * The @flags argument may be one of: | ||
3266 | * | ||
3267 | * %GFP_USER - Allocate memory on behalf of user. May sleep. | ||
3268 | * | ||
3269 | * %GFP_KERNEL - Allocate normal kernel ram. May sleep. | ||
3270 | * | ||
3271 | * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. | ||
3272 | * | ||
3273 | * Additionally, the %GFP_DMA flag may be set to indicate the memory | ||
3274 | * must be suitable for DMA. This can mean different things on different | ||
3275 | * platforms. For example, on i386, it means that the memory must come | ||
3276 | * from the first 16MB. | ||
3277 | */ | 3284 | */ |
3278 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | 3285 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, |
3279 | void *caller) | 3286 | void *caller) |
@@ -3371,6 +3378,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
3371 | { | 3378 | { |
3372 | unsigned long flags; | 3379 | unsigned long flags; |
3373 | 3380 | ||
3381 | BUG_ON(virt_to_cache(objp) != cachep); | ||
3382 | |||
3374 | local_irq_save(flags); | 3383 | local_irq_save(flags); |
3375 | __cache_free(cachep, objp); | 3384 | __cache_free(cachep, objp); |
3376 | local_irq_restore(flags); | 3385 | local_irq_restore(flags); |
@@ -3680,7 +3689,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, | |||
3680 | */ | 3689 | */ |
3681 | static void cache_reap(void *unused) | 3690 | static void cache_reap(void *unused) |
3682 | { | 3691 | { |
3683 | struct list_head *walk; | 3692 | struct kmem_cache *searchp; |
3684 | struct kmem_list3 *l3; | 3693 | struct kmem_list3 *l3; |
3685 | int node = numa_node_id(); | 3694 | int node = numa_node_id(); |
3686 | 3695 | ||
@@ -3691,13 +3700,11 @@ static void cache_reap(void *unused) | |||
3691 | return; | 3700 | return; |
3692 | } | 3701 | } |
3693 | 3702 | ||
3694 | list_for_each(walk, &cache_chain) { | 3703 | list_for_each_entry(searchp, &cache_chain, next) { |
3695 | struct kmem_cache *searchp; | ||
3696 | struct list_head *p; | 3704 | struct list_head *p; |
3697 | int tofree; | 3705 | int tofree; |
3698 | struct slab *slabp; | 3706 | struct slab *slabp; |
3699 | 3707 | ||
3700 | searchp = list_entry(walk, struct kmem_cache, next); | ||
3701 | check_irq_on(); | 3708 | check_irq_on(); |
3702 | 3709 | ||
3703 | /* | 3710 | /* |
@@ -3825,7 +3832,6 @@ static void s_stop(struct seq_file *m, void *p) | |||
3825 | static int s_show(struct seq_file *m, void *p) | 3832 | static int s_show(struct seq_file *m, void *p) |
3826 | { | 3833 | { |
3827 | struct kmem_cache *cachep = p; | 3834 | struct kmem_cache *cachep = p; |
3828 | struct list_head *q; | ||
3829 | struct slab *slabp; | 3835 | struct slab *slabp; |
3830 | unsigned long active_objs; | 3836 | unsigned long active_objs; |
3831 | unsigned long num_objs; | 3837 | unsigned long num_objs; |
@@ -3846,15 +3852,13 @@ static int s_show(struct seq_file *m, void *p) | |||
3846 | check_irq_on(); | 3852 | check_irq_on(); |
3847 | spin_lock_irq(&l3->list_lock); | 3853 | spin_lock_irq(&l3->list_lock); |
3848 | 3854 | ||
3849 | list_for_each(q, &l3->slabs_full) { | 3855 | list_for_each_entry(slabp, &l3->slabs_full, list) { |
3850 | slabp = list_entry(q, struct slab, list); | ||
3851 | if (slabp->inuse != cachep->num && !error) | 3856 | if (slabp->inuse != cachep->num && !error) |
3852 | error = "slabs_full accounting error"; | 3857 | error = "slabs_full accounting error"; |
3853 | active_objs += cachep->num; | 3858 | active_objs += cachep->num; |
3854 | active_slabs++; | 3859 | active_slabs++; |
3855 | } | 3860 | } |
3856 | list_for_each(q, &l3->slabs_partial) { | 3861 | list_for_each_entry(slabp, &l3->slabs_partial, list) { |
3857 | slabp = list_entry(q, struct slab, list); | ||
3858 | if (slabp->inuse == cachep->num && !error) | 3862 | if (slabp->inuse == cachep->num && !error) |
3859 | error = "slabs_partial inuse accounting error"; | 3863 | error = "slabs_partial inuse accounting error"; |
3860 | if (!slabp->inuse && !error) | 3864 | if (!slabp->inuse && !error) |
@@ -3862,8 +3866,7 @@ static int s_show(struct seq_file *m, void *p) | |||
3862 | active_objs += slabp->inuse; | 3866 | active_objs += slabp->inuse; |
3863 | active_slabs++; | 3867 | active_slabs++; |
3864 | } | 3868 | } |
3865 | list_for_each(q, &l3->slabs_free) { | 3869 | list_for_each_entry(slabp, &l3->slabs_free, list) { |
3866 | slabp = list_entry(q, struct slab, list); | ||
3867 | if (slabp->inuse && !error) | 3870 | if (slabp->inuse && !error) |
3868 | error = "slabs_free/inuse accounting error"; | 3871 | error = "slabs_free/inuse accounting error"; |
3869 | num_slabs++; | 3872 | num_slabs++; |
@@ -3956,7 +3959,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3956 | { | 3959 | { |
3957 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; | 3960 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; |
3958 | int limit, batchcount, shared, res; | 3961 | int limit, batchcount, shared, res; |
3959 | struct list_head *p; | 3962 | struct kmem_cache *cachep; |
3960 | 3963 | ||
3961 | if (count > MAX_SLABINFO_WRITE) | 3964 | if (count > MAX_SLABINFO_WRITE) |
3962 | return -EINVAL; | 3965 | return -EINVAL; |
@@ -3975,10 +3978,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3975 | /* Find the cache in the chain of caches. */ | 3978 | /* Find the cache in the chain of caches. */ |
3976 | mutex_lock(&cache_chain_mutex); | 3979 | mutex_lock(&cache_chain_mutex); |
3977 | res = -EINVAL; | 3980 | res = -EINVAL; |
3978 | list_for_each(p, &cache_chain) { | 3981 | list_for_each_entry(cachep, &cache_chain, next) { |
3979 | struct kmem_cache *cachep; | ||
3980 | |||
3981 | cachep = list_entry(p, struct kmem_cache, next); | ||
3982 | if (!strcmp(cachep->name, kbuf)) { | 3982 | if (!strcmp(cachep->name, kbuf)) { |
3983 | if (limit < 1 || batchcount < 1 || | 3983 | if (limit < 1 || batchcount < 1 || |
3984 | batchcount > limit || shared < 0) { | 3984 | batchcount > limit || shared < 0) { |
@@ -4080,7 +4080,6 @@ static void show_symbol(struct seq_file *m, unsigned long address) | |||
4080 | static int leaks_show(struct seq_file *m, void *p) | 4080 | static int leaks_show(struct seq_file *m, void *p) |
4081 | { | 4081 | { |
4082 | struct kmem_cache *cachep = p; | 4082 | struct kmem_cache *cachep = p; |
4083 | struct list_head *q; | ||
4084 | struct slab *slabp; | 4083 | struct slab *slabp; |
4085 | struct kmem_list3 *l3; | 4084 | struct kmem_list3 *l3; |
4086 | const char *name; | 4085 | const char *name; |
@@ -4105,14 +4104,10 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4105 | check_irq_on(); | 4104 | check_irq_on(); |
4106 | spin_lock_irq(&l3->list_lock); | 4105 | spin_lock_irq(&l3->list_lock); |
4107 | 4106 | ||
4108 | list_for_each(q, &l3->slabs_full) { | 4107 | list_for_each_entry(slabp, &l3->slabs_full, list) |
4109 | slabp = list_entry(q, struct slab, list); | ||
4110 | handle_slab(n, cachep, slabp); | 4108 | handle_slab(n, cachep, slabp); |
4111 | } | 4109 | list_for_each_entry(slabp, &l3->slabs_partial, list) |
4112 | list_for_each(q, &l3->slabs_partial) { | ||
4113 | slabp = list_entry(q, struct slab, list); | ||
4114 | handle_slab(n, cachep, slabp); | 4110 | handle_slab(n, cachep, slabp); |
4115 | } | ||
4116 | spin_unlock_irq(&l3->list_lock); | 4111 | spin_unlock_irq(&l3->list_lock); |
4117 | } | 4112 | } |
4118 | name = cachep->name; | 4113 | name = cachep->name; |
diff --git a/mm/sparse.c b/mm/sparse.c index 100040c0dfb6..e0a3fe48aa37 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -99,6 +99,22 @@ int __section_nr(struct mem_section* ms) | |||
99 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); | 99 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); |
100 | } | 100 | } |
101 | 101 | ||
102 | /* | ||
103 | * During early boot, before section_mem_map is used for an actual | ||
104 | * mem_map, we use section_mem_map to store the section's NUMA | ||
105 | * node. This keeps us from having to use another data structure. The | ||
106 | * node information is cleared just before we store the real mem_map. | ||
107 | */ | ||
108 | static inline unsigned long sparse_encode_early_nid(int nid) | ||
109 | { | ||
110 | return (nid << SECTION_NID_SHIFT); | ||
111 | } | ||
112 | |||
113 | static inline int sparse_early_nid(struct mem_section *section) | ||
114 | { | ||
115 | return (section->section_mem_map >> SECTION_NID_SHIFT); | ||
116 | } | ||
117 | |||
102 | /* Record a memory area against a node. */ | 118 | /* Record a memory area against a node. */ |
103 | void memory_present(int nid, unsigned long start, unsigned long end) | 119 | void memory_present(int nid, unsigned long start, unsigned long end) |
104 | { | 120 | { |
@@ -113,7 +129,8 @@ void memory_present(int nid, unsigned long start, unsigned long end) | |||
113 | 129 | ||
114 | ms = __nr_to_section(section); | 130 | ms = __nr_to_section(section); |
115 | if (!ms->section_mem_map) | 131 | if (!ms->section_mem_map) |
116 | ms->section_mem_map = SECTION_MARKED_PRESENT; | 132 | ms->section_mem_map = sparse_encode_early_nid(nid) | |
133 | SECTION_MARKED_PRESENT; | ||
117 | } | 134 | } |
118 | } | 135 | } |
119 | 136 | ||
@@ -164,6 +181,7 @@ static int sparse_init_one_section(struct mem_section *ms, | |||
164 | if (!valid_section(ms)) | 181 | if (!valid_section(ms)) |
165 | return -EINVAL; | 182 | return -EINVAL; |
166 | 183 | ||
184 | ms->section_mem_map &= ~SECTION_MAP_MASK; | ||
167 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); | 185 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); |
168 | 186 | ||
169 | return 1; | 187 | return 1; |
@@ -172,8 +190,8 @@ static int sparse_init_one_section(struct mem_section *ms, | |||
172 | static struct page *sparse_early_mem_map_alloc(unsigned long pnum) | 190 | static struct page *sparse_early_mem_map_alloc(unsigned long pnum) |
173 | { | 191 | { |
174 | struct page *map; | 192 | struct page *map; |
175 | int nid = early_pfn_to_nid(section_nr_to_pfn(pnum)); | ||
176 | struct mem_section *ms = __nr_to_section(pnum); | 193 | struct mem_section *ms = __nr_to_section(pnum); |
194 | int nid = sparse_early_nid(ms); | ||
177 | 195 | ||
178 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); | 196 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); |
179 | if (map) | 197 | if (map) |
@@ -480,48 +480,6 @@ static int cpu_swap_callback(struct notifier_block *nfb, | |||
480 | #endif /* CONFIG_HOTPLUG_CPU */ | 480 | #endif /* CONFIG_HOTPLUG_CPU */ |
481 | #endif /* CONFIG_SMP */ | 481 | #endif /* CONFIG_SMP */ |
482 | 482 | ||
483 | #ifdef CONFIG_SMP | ||
484 | void percpu_counter_mod(struct percpu_counter *fbc, long amount) | ||
485 | { | ||
486 | long count; | ||
487 | long *pcount; | ||
488 | int cpu = get_cpu(); | ||
489 | |||
490 | pcount = per_cpu_ptr(fbc->counters, cpu); | ||
491 | count = *pcount + amount; | ||
492 | if (count >= FBC_BATCH || count <= -FBC_BATCH) { | ||
493 | spin_lock(&fbc->lock); | ||
494 | fbc->count += count; | ||
495 | *pcount = 0; | ||
496 | spin_unlock(&fbc->lock); | ||
497 | } else { | ||
498 | *pcount = count; | ||
499 | } | ||
500 | put_cpu(); | ||
501 | } | ||
502 | EXPORT_SYMBOL(percpu_counter_mod); | ||
503 | |||
504 | /* | ||
505 | * Add up all the per-cpu counts, return the result. This is a more accurate | ||
506 | * but much slower version of percpu_counter_read_positive() | ||
507 | */ | ||
508 | long percpu_counter_sum(struct percpu_counter *fbc) | ||
509 | { | ||
510 | long ret; | ||
511 | int cpu; | ||
512 | |||
513 | spin_lock(&fbc->lock); | ||
514 | ret = fbc->count; | ||
515 | for_each_possible_cpu(cpu) { | ||
516 | long *pcount = per_cpu_ptr(fbc->counters, cpu); | ||
517 | ret += *pcount; | ||
518 | } | ||
519 | spin_unlock(&fbc->lock); | ||
520 | return ret < 0 ? 0 : ret; | ||
521 | } | ||
522 | EXPORT_SYMBOL(percpu_counter_sum); | ||
523 | #endif | ||
524 | |||
525 | /* | 483 | /* |
526 | * Perform any setup for the swap system | 484 | * Perform any setup for the swap system |
527 | */ | 485 | */ |
diff --git a/mm/swapfile.c b/mm/swapfile.c index e5fd5385f0cc..cc367f7e75d8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t entry) | |||
395 | struct swap_info_struct * p; | 395 | struct swap_info_struct * p; |
396 | struct page *page = NULL; | 396 | struct page *page = NULL; |
397 | 397 | ||
398 | if (is_migration_entry(entry)) | ||
399 | return; | ||
400 | |||
398 | p = swap_info_get(entry); | 401 | p = swap_info_get(entry); |
399 | if (p) { | 402 | if (p) { |
400 | if (swap_entry_free(p, swp_offset(entry)) == 1) { | 403 | if (swap_entry_free(p, swp_offset(entry)) == 1) { |
@@ -615,15 +618,6 @@ static int unuse_mm(struct mm_struct *mm, | |||
615 | return 0; | 618 | return 0; |
616 | } | 619 | } |
617 | 620 | ||
618 | #ifdef CONFIG_MIGRATION | ||
619 | int remove_vma_swap(struct vm_area_struct *vma, struct page *page) | ||
620 | { | ||
621 | swp_entry_t entry = { .val = page_private(page) }; | ||
622 | |||
623 | return unuse_vma(vma, entry, page); | ||
624 | } | ||
625 | #endif | ||
626 | |||
627 | /* | 621 | /* |
628 | * Scan swap_map from current position to next entry still in use. | 622 | * Scan swap_map from current position to next entry still in use. |
629 | * Recycle to start on reaching the end, returning 0 when empty. | 623 | * Recycle to start on reaching the end, returning 0 when empty. |
@@ -716,7 +710,6 @@ static int try_to_unuse(unsigned int type) | |||
716 | */ | 710 | */ |
717 | swap_map = &si->swap_map[i]; | 711 | swap_map = &si->swap_map[i]; |
718 | entry = swp_entry(type, i); | 712 | entry = swp_entry(type, i); |
719 | again: | ||
720 | page = read_swap_cache_async(entry, NULL, 0); | 713 | page = read_swap_cache_async(entry, NULL, 0); |
721 | if (!page) { | 714 | if (!page) { |
722 | /* | 715 | /* |
@@ -751,12 +744,6 @@ again: | |||
751 | wait_on_page_locked(page); | 744 | wait_on_page_locked(page); |
752 | wait_on_page_writeback(page); | 745 | wait_on_page_writeback(page); |
753 | lock_page(page); | 746 | lock_page(page); |
754 | if (!PageSwapCache(page)) { | ||
755 | /* Page migration has occured */ | ||
756 | unlock_page(page); | ||
757 | page_cache_release(page); | ||
758 | goto again; | ||
759 | } | ||
760 | wait_on_page_writeback(page); | 747 | wait_on_page_writeback(page); |
761 | 748 | ||
762 | /* | 749 | /* |
@@ -785,10 +772,8 @@ again: | |||
785 | while (*swap_map > 1 && !retval && | 772 | while (*swap_map > 1 && !retval && |
786 | (p = p->next) != &start_mm->mmlist) { | 773 | (p = p->next) != &start_mm->mmlist) { |
787 | mm = list_entry(p, struct mm_struct, mmlist); | 774 | mm = list_entry(p, struct mm_struct, mmlist); |
788 | if (atomic_inc_return(&mm->mm_users) == 1) { | 775 | if (!atomic_inc_not_zero(&mm->mm_users)) |
789 | atomic_dec(&mm->mm_users); | ||
790 | continue; | 776 | continue; |
791 | } | ||
792 | spin_unlock(&mmlist_lock); | 777 | spin_unlock(&mmlist_lock); |
793 | mmput(prev_mm); | 778 | mmput(prev_mm); |
794 | prev_mm = mm; | 779 | prev_mm = mm; |
@@ -1407,19 +1392,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1407 | if (!(p->flags & SWP_USED)) | 1392 | if (!(p->flags & SWP_USED)) |
1408 | break; | 1393 | break; |
1409 | error = -EPERM; | 1394 | error = -EPERM; |
1410 | /* | 1395 | if (type >= MAX_SWAPFILES) { |
1411 | * Test if adding another swap device is possible. There are | ||
1412 | * two limiting factors: 1) the number of bits for the swap | ||
1413 | * type swp_entry_t definition and 2) the number of bits for | ||
1414 | * the swap type in the swap ptes as defined by the different | ||
1415 | * architectures. To honor both limitations a swap entry | ||
1416 | * with swap offset 0 and swap type ~0UL is created, encoded | ||
1417 | * to a swap pte, decoded to a swp_entry_t again and finally | ||
1418 | * the swap type part is extracted. This will mask all bits | ||
1419 | * from the initial ~0UL that can't be encoded in either the | ||
1420 | * swp_entry_t or the architecture definition of a swap pte. | ||
1421 | */ | ||
1422 | if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { | ||
1423 | spin_unlock(&swap_lock); | 1396 | spin_unlock(&swap_lock); |
1424 | goto out; | 1397 | goto out; |
1425 | } | 1398 | } |
@@ -1504,8 +1477,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1504 | error = -EINVAL; | 1477 | error = -EINVAL; |
1505 | goto bad_swap; | 1478 | goto bad_swap; |
1506 | } | 1479 | } |
1507 | page = read_cache_page(mapping, 0, | 1480 | page = read_mapping_page(mapping, 0, swap_file); |
1508 | (filler_t *)mapping->a_ops->readpage, swap_file); | ||
1509 | if (IS_ERR(page)) { | 1481 | if (IS_ERR(page)) { |
1510 | error = PTR_ERR(page); | 1482 | error = PTR_ERR(page); |
1511 | goto bad_swap; | 1483 | goto bad_swap; |
@@ -1709,6 +1681,9 @@ int swap_duplicate(swp_entry_t entry) | |||
1709 | unsigned long offset, type; | 1681 | unsigned long offset, type; |
1710 | int result = 0; | 1682 | int result = 0; |
1711 | 1683 | ||
1684 | if (is_migration_entry(entry)) | ||
1685 | return 1; | ||
1686 | |||
1712 | type = swp_type(entry); | 1687 | type = swp_type(entry); |
1713 | if (type >= nr_swapfiles) | 1688 | if (type >= nr_swapfiles) |
1714 | goto bad_file; | 1689 | goto bad_file; |
diff --git a/mm/truncate.c b/mm/truncate.c index 6cb3fff25f67..cf1b015df4a7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -230,14 +230,24 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
230 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 230 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
231 | for (i = 0; i < pagevec_count(&pvec); i++) { | 231 | for (i = 0; i < pagevec_count(&pvec); i++) { |
232 | struct page *page = pvec.pages[i]; | 232 | struct page *page = pvec.pages[i]; |
233 | pgoff_t index; | ||
234 | int lock_failed; | ||
233 | 235 | ||
234 | if (TestSetPageLocked(page)) { | 236 | lock_failed = TestSetPageLocked(page); |
235 | next++; | 237 | |
236 | continue; | 238 | /* |
237 | } | 239 | * We really shouldn't be looking at the ->index of an |
238 | if (page->index > next) | 240 | * unlocked page. But we're not allowed to lock these |
239 | next = page->index; | 241 | * pages. So we rely upon nobody altering the ->index |
242 | * of this (pinned-by-us) page. | ||
243 | */ | ||
244 | index = page->index; | ||
245 | if (index > next) | ||
246 | next = index; | ||
240 | next++; | 247 | next++; |
248 | if (lock_failed) | ||
249 | continue; | ||
250 | |||
241 | if (PageDirty(page) || PageWriteback(page)) | 251 | if (PageDirty(page) || PageWriteback(page)) |
242 | goto unlock; | 252 | goto unlock; |
243 | if (page_mapped(page)) | 253 | if (page_mapped(page)) |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c0504f1e34eb..35f8553f893a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -257,6 +257,19 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int | |||
257 | } | 257 | } |
258 | 258 | ||
259 | /* Caller must hold vmlist_lock */ | 259 | /* Caller must hold vmlist_lock */ |
260 | static struct vm_struct *__find_vm_area(void *addr) | ||
261 | { | ||
262 | struct vm_struct *tmp; | ||
263 | |||
264 | for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { | ||
265 | if (tmp->addr == addr) | ||
266 | break; | ||
267 | } | ||
268 | |||
269 | return tmp; | ||
270 | } | ||
271 | |||
272 | /* Caller must hold vmlist_lock */ | ||
260 | struct vm_struct *__remove_vm_area(void *addr) | 273 | struct vm_struct *__remove_vm_area(void *addr) |
261 | { | 274 | { |
262 | struct vm_struct **p, *tmp; | 275 | struct vm_struct **p, *tmp; |
@@ -498,11 +511,33 @@ EXPORT_SYMBOL(__vmalloc); | |||
498 | */ | 511 | */ |
499 | void *vmalloc(unsigned long size) | 512 | void *vmalloc(unsigned long size) |
500 | { | 513 | { |
501 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); | 514 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); |
502 | } | 515 | } |
503 | EXPORT_SYMBOL(vmalloc); | 516 | EXPORT_SYMBOL(vmalloc); |
504 | 517 | ||
505 | /** | 518 | /** |
519 | * vmalloc_user - allocate virtually contiguous memory which has | ||
520 | * been zeroed so it can be mapped to userspace without | ||
521 | * leaking data. | ||
522 | * | ||
523 | * @size: allocation size | ||
524 | */ | ||
525 | void *vmalloc_user(unsigned long size) | ||
526 | { | ||
527 | struct vm_struct *area; | ||
528 | void *ret; | ||
529 | |||
530 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | ||
531 | write_lock(&vmlist_lock); | ||
532 | area = __find_vm_area(ret); | ||
533 | area->flags |= VM_USERMAP; | ||
534 | write_unlock(&vmlist_lock); | ||
535 | |||
536 | return ret; | ||
537 | } | ||
538 | EXPORT_SYMBOL(vmalloc_user); | ||
539 | |||
540 | /** | ||
506 | * vmalloc_node - allocate memory on a specific node | 541 | * vmalloc_node - allocate memory on a specific node |
507 | * | 542 | * |
508 | * @size: allocation size | 543 | * @size: allocation size |
@@ -516,7 +551,7 @@ EXPORT_SYMBOL(vmalloc); | |||
516 | */ | 551 | */ |
517 | void *vmalloc_node(unsigned long size, int node) | 552 | void *vmalloc_node(unsigned long size, int node) |
518 | { | 553 | { |
519 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); | 554 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); |
520 | } | 555 | } |
521 | EXPORT_SYMBOL(vmalloc_node); | 556 | EXPORT_SYMBOL(vmalloc_node); |
522 | 557 | ||
@@ -556,6 +591,28 @@ void *vmalloc_32(unsigned long size) | |||
556 | } | 591 | } |
557 | EXPORT_SYMBOL(vmalloc_32); | 592 | EXPORT_SYMBOL(vmalloc_32); |
558 | 593 | ||
594 | /** | ||
595 | * vmalloc_32_user - allocate virtually contiguous memory (32bit | ||
596 | * addressable) which is zeroed so it can be | ||
597 | * mapped to userspace without leaking data. | ||
598 | * | ||
599 | * @size: allocation size | ||
600 | */ | ||
601 | void *vmalloc_32_user(unsigned long size) | ||
602 | { | ||
603 | struct vm_struct *area; | ||
604 | void *ret; | ||
605 | |||
606 | ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); | ||
607 | write_lock(&vmlist_lock); | ||
608 | area = __find_vm_area(ret); | ||
609 | area->flags |= VM_USERMAP; | ||
610 | write_unlock(&vmlist_lock); | ||
611 | |||
612 | return ret; | ||
613 | } | ||
614 | EXPORT_SYMBOL(vmalloc_32_user); | ||
615 | |||
559 | long vread(char *buf, char *addr, unsigned long count) | 616 | long vread(char *buf, char *addr, unsigned long count) |
560 | { | 617 | { |
561 | struct vm_struct *tmp; | 618 | struct vm_struct *tmp; |
@@ -630,3 +687,64 @@ finished: | |||
630 | read_unlock(&vmlist_lock); | 687 | read_unlock(&vmlist_lock); |
631 | return buf - buf_start; | 688 | return buf - buf_start; |
632 | } | 689 | } |
690 | |||
691 | /** | ||
692 | * remap_vmalloc_range - map vmalloc pages to userspace | ||
693 | * | ||
694 | * @vma: vma to cover (map full range of vma) | ||
695 | * @addr: vmalloc memory | ||
696 | * @pgoff: number of pages into addr before first page to map | ||
697 | * @returns: 0 for success, -Exxx on failure | ||
698 | * | ||
699 | * This function checks that addr is a valid vmalloc'ed area, and | ||
700 | * that it is big enough to cover the vma. Will return failure if | ||
701 | * that criteria isn't met. | ||
702 | * | ||
703 | * Similar to remap_pfn_range (see mm/memory.c) | ||
704 | */ | ||
705 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | ||
706 | unsigned long pgoff) | ||
707 | { | ||
708 | struct vm_struct *area; | ||
709 | unsigned long uaddr = vma->vm_start; | ||
710 | unsigned long usize = vma->vm_end - vma->vm_start; | ||
711 | int ret; | ||
712 | |||
713 | if ((PAGE_SIZE-1) & (unsigned long)addr) | ||
714 | return -EINVAL; | ||
715 | |||
716 | read_lock(&vmlist_lock); | ||
717 | area = __find_vm_area(addr); | ||
718 | if (!area) | ||
719 | goto out_einval_locked; | ||
720 | |||
721 | if (!(area->flags & VM_USERMAP)) | ||
722 | goto out_einval_locked; | ||
723 | |||
724 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) | ||
725 | goto out_einval_locked; | ||
726 | read_unlock(&vmlist_lock); | ||
727 | |||
728 | addr += pgoff << PAGE_SHIFT; | ||
729 | do { | ||
730 | struct page *page = vmalloc_to_page(addr); | ||
731 | ret = vm_insert_page(vma, uaddr, page); | ||
732 | if (ret) | ||
733 | return ret; | ||
734 | |||
735 | uaddr += PAGE_SIZE; | ||
736 | addr += PAGE_SIZE; | ||
737 | usize -= PAGE_SIZE; | ||
738 | } while (usize > 0); | ||
739 | |||
740 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | ||
741 | vma->vm_flags |= VM_RESERVED; | ||
742 | |||
743 | return ret; | ||
744 | |||
745 | out_einval_locked: | ||
746 | read_unlock(&vmlist_lock); | ||
747 | return -EINVAL; | ||
748 | } | ||
749 | EXPORT_SYMBOL(remap_vmalloc_range); | ||
750 | |||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 440a733fe2e9..72babac71dea 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -61,6 +61,8 @@ struct scan_control { | |||
61 | * In this context, it doesn't matter that we scan the | 61 | * In this context, it doesn't matter that we scan the |
62 | * whole list at once. */ | 62 | * whole list at once. */ |
63 | int swap_cluster_max; | 63 | int swap_cluster_max; |
64 | |||
65 | int swappiness; | ||
64 | }; | 66 | }; |
65 | 67 | ||
66 | /* | 68 | /* |
@@ -108,7 +110,7 @@ struct shrinker { | |||
108 | * From 0 .. 100. Higher means more swappy. | 110 | * From 0 .. 100. Higher means more swappy. |
109 | */ | 111 | */ |
110 | int vm_swappiness = 60; | 112 | int vm_swappiness = 60; |
111 | static long total_memory; | 113 | long vm_total_pages; /* The total number of pages which the VM controls */ |
112 | 114 | ||
113 | static LIST_HEAD(shrinker_list); | 115 | static LIST_HEAD(shrinker_list); |
114 | static DECLARE_RWSEM(shrinker_rwsem); | 116 | static DECLARE_RWSEM(shrinker_rwsem); |
@@ -288,11 +290,23 @@ static void handle_write_error(struct address_space *mapping, | |||
288 | unlock_page(page); | 290 | unlock_page(page); |
289 | } | 291 | } |
290 | 292 | ||
293 | /* possible outcome of pageout() */ | ||
294 | typedef enum { | ||
295 | /* failed to write page out, page is locked */ | ||
296 | PAGE_KEEP, | ||
297 | /* move page to the active list, page is locked */ | ||
298 | PAGE_ACTIVATE, | ||
299 | /* page has been sent to the disk successfully, page is unlocked */ | ||
300 | PAGE_SUCCESS, | ||
301 | /* page is clean and locked */ | ||
302 | PAGE_CLEAN, | ||
303 | } pageout_t; | ||
304 | |||
291 | /* | 305 | /* |
292 | * pageout is called by shrink_page_list() for each dirty page. | 306 | * pageout is called by shrink_page_list() for each dirty page. |
293 | * Calls ->writepage(). | 307 | * Calls ->writepage(). |
294 | */ | 308 | */ |
295 | pageout_t pageout(struct page *page, struct address_space *mapping) | 309 | static pageout_t pageout(struct page *page, struct address_space *mapping) |
296 | { | 310 | { |
297 | /* | 311 | /* |
298 | * If the page is dirty, only perform writeback if that write | 312 | * If the page is dirty, only perform writeback if that write |
@@ -337,6 +351,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping) | |||
337 | struct writeback_control wbc = { | 351 | struct writeback_control wbc = { |
338 | .sync_mode = WB_SYNC_NONE, | 352 | .sync_mode = WB_SYNC_NONE, |
339 | .nr_to_write = SWAP_CLUSTER_MAX, | 353 | .nr_to_write = SWAP_CLUSTER_MAX, |
354 | .range_start = 0, | ||
355 | .range_end = LLONG_MAX, | ||
340 | .nonblocking = 1, | 356 | .nonblocking = 1, |
341 | .for_reclaim = 1, | 357 | .for_reclaim = 1, |
342 | }; | 358 | }; |
@@ -727,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
727 | * how much memory | 743 | * how much memory |
728 | * is mapped. | 744 | * is mapped. |
729 | */ | 745 | */ |
730 | mapped_ratio = (sc->nr_mapped * 100) / total_memory; | 746 | mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages; |
731 | 747 | ||
732 | /* | 748 | /* |
733 | * Now decide how much we really want to unmap some pages. The | 749 | * Now decide how much we really want to unmap some pages. The |
@@ -741,7 +757,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
741 | * A 100% value of vm_swappiness overrides this algorithm | 757 | * A 100% value of vm_swappiness overrides this algorithm |
742 | * altogether. | 758 | * altogether. |
743 | */ | 759 | */ |
744 | swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; | 760 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; |
745 | 761 | ||
746 | /* | 762 | /* |
747 | * Now use this metric to decide whether to start moving mapped | 763 | * Now use this metric to decide whether to start moving mapped |
@@ -957,6 +973,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
957 | .may_writepage = !laptop_mode, | 973 | .may_writepage = !laptop_mode, |
958 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 974 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
959 | .may_swap = 1, | 975 | .may_swap = 1, |
976 | .swappiness = vm_swappiness, | ||
960 | }; | 977 | }; |
961 | 978 | ||
962 | inc_page_state(allocstall); | 979 | inc_page_state(allocstall); |
@@ -1021,10 +1038,6 @@ out: | |||
1021 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1038 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1022 | * they are all at pages_high. | 1039 | * they are all at pages_high. |
1023 | * | 1040 | * |
1024 | * If `nr_pages' is non-zero then it is the number of pages which are to be | ||
1025 | * reclaimed, regardless of the zone occupancies. This is a software suspend | ||
1026 | * special. | ||
1027 | * | ||
1028 | * Returns the number of pages which were actually freed. | 1041 | * Returns the number of pages which were actually freed. |
1029 | * | 1042 | * |
1030 | * There is special handling here for zones which are full of pinned pages. | 1043 | * There is special handling here for zones which are full of pinned pages. |
@@ -1042,10 +1055,8 @@ out: | |||
1042 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1055 | * the page allocator fallback scheme to ensure that aging of pages is balanced |
1043 | * across the zones. | 1056 | * across the zones. |
1044 | */ | 1057 | */ |
1045 | static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, | 1058 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) |
1046 | int order) | ||
1047 | { | 1059 | { |
1048 | unsigned long to_free = nr_pages; | ||
1049 | int all_zones_ok; | 1060 | int all_zones_ok; |
1050 | int priority; | 1061 | int priority; |
1051 | int i; | 1062 | int i; |
@@ -1055,7 +1066,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, | |||
1055 | struct scan_control sc = { | 1066 | struct scan_control sc = { |
1056 | .gfp_mask = GFP_KERNEL, | 1067 | .gfp_mask = GFP_KERNEL, |
1057 | .may_swap = 1, | 1068 | .may_swap = 1, |
1058 | .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, | 1069 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
1070 | .swappiness = vm_swappiness, | ||
1059 | }; | 1071 | }; |
1060 | 1072 | ||
1061 | loop_again: | 1073 | loop_again: |
@@ -1082,31 +1094,26 @@ loop_again: | |||
1082 | 1094 | ||
1083 | all_zones_ok = 1; | 1095 | all_zones_ok = 1; |
1084 | 1096 | ||
1085 | if (nr_pages == 0) { | 1097 | /* |
1086 | /* | 1098 | * Scan in the highmem->dma direction for the highest |
1087 | * Scan in the highmem->dma direction for the highest | 1099 | * zone which needs scanning |
1088 | * zone which needs scanning | 1100 | */ |
1089 | */ | 1101 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { |
1090 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { | 1102 | struct zone *zone = pgdat->node_zones + i; |
1091 | struct zone *zone = pgdat->node_zones + i; | ||
1092 | 1103 | ||
1093 | if (!populated_zone(zone)) | 1104 | if (!populated_zone(zone)) |
1094 | continue; | 1105 | continue; |
1095 | 1106 | ||
1096 | if (zone->all_unreclaimable && | 1107 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1097 | priority != DEF_PRIORITY) | 1108 | continue; |
1098 | continue; | ||
1099 | 1109 | ||
1100 | if (!zone_watermark_ok(zone, order, | 1110 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1101 | zone->pages_high, 0, 0)) { | 1111 | 0, 0)) { |
1102 | end_zone = i; | 1112 | end_zone = i; |
1103 | goto scan; | 1113 | goto scan; |
1104 | } | ||
1105 | } | 1114 | } |
1106 | goto out; | ||
1107 | } else { | ||
1108 | end_zone = pgdat->nr_zones - 1; | ||
1109 | } | 1115 | } |
1116 | goto out; | ||
1110 | scan: | 1117 | scan: |
1111 | for (i = 0; i <= end_zone; i++) { | 1118 | for (i = 0; i <= end_zone; i++) { |
1112 | struct zone *zone = pgdat->node_zones + i; | 1119 | struct zone *zone = pgdat->node_zones + i; |
@@ -1133,11 +1140,9 @@ scan: | |||
1133 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1140 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1134 | continue; | 1141 | continue; |
1135 | 1142 | ||
1136 | if (nr_pages == 0) { /* Not software suspend */ | 1143 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1137 | if (!zone_watermark_ok(zone, order, | 1144 | end_zone, 0)) |
1138 | zone->pages_high, end_zone, 0)) | 1145 | all_zones_ok = 0; |
1139 | all_zones_ok = 0; | ||
1140 | } | ||
1141 | zone->temp_priority = priority; | 1146 | zone->temp_priority = priority; |
1142 | if (zone->prev_priority > priority) | 1147 | if (zone->prev_priority > priority) |
1143 | zone->prev_priority = priority; | 1148 | zone->prev_priority = priority; |
@@ -1162,8 +1167,6 @@ scan: | |||
1162 | total_scanned > nr_reclaimed + nr_reclaimed / 2) | 1167 | total_scanned > nr_reclaimed + nr_reclaimed / 2) |
1163 | sc.may_writepage = 1; | 1168 | sc.may_writepage = 1; |
1164 | } | 1169 | } |
1165 | if (nr_pages && to_free > nr_reclaimed) | ||
1166 | continue; /* swsusp: need to do more work */ | ||
1167 | if (all_zones_ok) | 1170 | if (all_zones_ok) |
1168 | break; /* kswapd: all done */ | 1171 | break; /* kswapd: all done */ |
1169 | /* | 1172 | /* |
@@ -1179,7 +1182,7 @@ scan: | |||
1179 | * matches the direct reclaim path behaviour in terms of impact | 1182 | * matches the direct reclaim path behaviour in terms of impact |
1180 | * on zone->*_priority. | 1183 | * on zone->*_priority. |
1181 | */ | 1184 | */ |
1182 | if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) | 1185 | if (nr_reclaimed >= SWAP_CLUSTER_MAX) |
1183 | break; | 1186 | break; |
1184 | } | 1187 | } |
1185 | out: | 1188 | out: |
@@ -1261,7 +1264,7 @@ static int kswapd(void *p) | |||
1261 | } | 1264 | } |
1262 | finish_wait(&pgdat->kswapd_wait, &wait); | 1265 | finish_wait(&pgdat->kswapd_wait, &wait); |
1263 | 1266 | ||
1264 | balance_pgdat(pgdat, 0, order); | 1267 | balance_pgdat(pgdat, order); |
1265 | } | 1268 | } |
1266 | return 0; | 1269 | return 0; |
1267 | } | 1270 | } |
@@ -1290,35 +1293,154 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
1290 | 1293 | ||
1291 | #ifdef CONFIG_PM | 1294 | #ifdef CONFIG_PM |
1292 | /* | 1295 | /* |
1293 | * Try to free `nr_pages' of memory, system-wide. Returns the number of freed | 1296 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages |
1294 | * pages. | 1297 | * from LRU lists system-wide, for given pass and priority, and returns the |
1298 | * number of reclaimed pages | ||
1299 | * | ||
1300 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | ||
1301 | */ | ||
1302 | static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, | ||
1303 | int prio, struct scan_control *sc) | ||
1304 | { | ||
1305 | struct zone *zone; | ||
1306 | unsigned long nr_to_scan, ret = 0; | ||
1307 | |||
1308 | for_each_zone(zone) { | ||
1309 | |||
1310 | if (!populated_zone(zone)) | ||
1311 | continue; | ||
1312 | |||
1313 | if (zone->all_unreclaimable && prio != DEF_PRIORITY) | ||
1314 | continue; | ||
1315 | |||
1316 | /* For pass = 0 we don't shrink the active list */ | ||
1317 | if (pass > 0) { | ||
1318 | zone->nr_scan_active += (zone->nr_active >> prio) + 1; | ||
1319 | if (zone->nr_scan_active >= nr_pages || pass > 3) { | ||
1320 | zone->nr_scan_active = 0; | ||
1321 | nr_to_scan = min(nr_pages, zone->nr_active); | ||
1322 | shrink_active_list(nr_to_scan, zone, sc); | ||
1323 | } | ||
1324 | } | ||
1325 | |||
1326 | zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1; | ||
1327 | if (zone->nr_scan_inactive >= nr_pages || pass > 3) { | ||
1328 | zone->nr_scan_inactive = 0; | ||
1329 | nr_to_scan = min(nr_pages, zone->nr_inactive); | ||
1330 | ret += shrink_inactive_list(nr_to_scan, zone, sc); | ||
1331 | if (ret >= nr_pages) | ||
1332 | return ret; | ||
1333 | } | ||
1334 | } | ||
1335 | |||
1336 | return ret; | ||
1337 | } | ||
1338 | |||
1339 | /* | ||
1340 | * Try to free `nr_pages' of memory, system-wide, and return the number of | ||
1341 | * freed pages. | ||
1342 | * | ||
1343 | * Rather than trying to age LRUs the aim is to preserve the overall | ||
1344 | * LRU order by reclaiming preferentially | ||
1345 | * inactive > active > active referenced > active mapped | ||
1295 | */ | 1346 | */ |
1296 | unsigned long shrink_all_memory(unsigned long nr_pages) | 1347 | unsigned long shrink_all_memory(unsigned long nr_pages) |
1297 | { | 1348 | { |
1298 | pg_data_t *pgdat; | 1349 | unsigned long lru_pages, nr_slab; |
1299 | unsigned long nr_to_free = nr_pages; | ||
1300 | unsigned long ret = 0; | 1350 | unsigned long ret = 0; |
1301 | unsigned retry = 2; | 1351 | int pass; |
1302 | struct reclaim_state reclaim_state = { | 1352 | struct reclaim_state reclaim_state; |
1303 | .reclaimed_slab = 0, | 1353 | struct zone *zone; |
1354 | struct scan_control sc = { | ||
1355 | .gfp_mask = GFP_KERNEL, | ||
1356 | .may_swap = 0, | ||
1357 | .swap_cluster_max = nr_pages, | ||
1358 | .may_writepage = 1, | ||
1359 | .swappiness = vm_swappiness, | ||
1304 | }; | 1360 | }; |
1305 | 1361 | ||
1306 | current->reclaim_state = &reclaim_state; | 1362 | current->reclaim_state = &reclaim_state; |
1307 | repeat: | ||
1308 | for_each_online_pgdat(pgdat) { | ||
1309 | unsigned long freed; | ||
1310 | 1363 | ||
1311 | freed = balance_pgdat(pgdat, nr_to_free, 0); | 1364 | lru_pages = 0; |
1312 | ret += freed; | 1365 | for_each_zone(zone) |
1313 | nr_to_free -= freed; | 1366 | lru_pages += zone->nr_active + zone->nr_inactive; |
1314 | if ((long)nr_to_free <= 0) | 1367 | |
1368 | nr_slab = read_page_state(nr_slab); | ||
1369 | /* If slab caches are huge, it's better to hit them first */ | ||
1370 | while (nr_slab >= lru_pages) { | ||
1371 | reclaim_state.reclaimed_slab = 0; | ||
1372 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
1373 | if (!reclaim_state.reclaimed_slab) | ||
1315 | break; | 1374 | break; |
1375 | |||
1376 | ret += reclaim_state.reclaimed_slab; | ||
1377 | if (ret >= nr_pages) | ||
1378 | goto out; | ||
1379 | |||
1380 | nr_slab -= reclaim_state.reclaimed_slab; | ||
1316 | } | 1381 | } |
1317 | if (retry-- && ret < nr_pages) { | 1382 | |
1318 | blk_congestion_wait(WRITE, HZ/5); | 1383 | /* |
1319 | goto repeat; | 1384 | * We try to shrink LRUs in 5 passes: |
1385 | * 0 = Reclaim from inactive_list only | ||
1386 | * 1 = Reclaim from active list but don't reclaim mapped | ||
1387 | * 2 = 2nd pass of type 1 | ||
1388 | * 3 = Reclaim mapped (normal reclaim) | ||
1389 | * 4 = 2nd pass of type 3 | ||
1390 | */ | ||
1391 | for (pass = 0; pass < 5; pass++) { | ||
1392 | int prio; | ||
1393 | |||
1394 | /* Needed for shrinking slab caches later on */ | ||
1395 | if (!lru_pages) | ||
1396 | for_each_zone(zone) { | ||
1397 | lru_pages += zone->nr_active; | ||
1398 | lru_pages += zone->nr_inactive; | ||
1399 | } | ||
1400 | |||
1401 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | ||
1402 | if (pass > 2) { | ||
1403 | sc.may_swap = 1; | ||
1404 | sc.swappiness = 100; | ||
1405 | } | ||
1406 | |||
1407 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | ||
1408 | unsigned long nr_to_scan = nr_pages - ret; | ||
1409 | |||
1410 | sc.nr_mapped = read_page_state(nr_mapped); | ||
1411 | sc.nr_scanned = 0; | ||
1412 | |||
1413 | ret += shrink_all_zones(nr_to_scan, pass, prio, &sc); | ||
1414 | if (ret >= nr_pages) | ||
1415 | goto out; | ||
1416 | |||
1417 | reclaim_state.reclaimed_slab = 0; | ||
1418 | shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); | ||
1419 | ret += reclaim_state.reclaimed_slab; | ||
1420 | if (ret >= nr_pages) | ||
1421 | goto out; | ||
1422 | |||
1423 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | ||
1424 | blk_congestion_wait(WRITE, HZ / 10); | ||
1425 | } | ||
1426 | |||
1427 | lru_pages = 0; | ||
1320 | } | 1428 | } |
1429 | |||
1430 | /* | ||
1431 | * If ret = 0, we could not shrink LRUs, but there may be something | ||
1432 | * in slab caches | ||
1433 | */ | ||
1434 | if (!ret) | ||
1435 | do { | ||
1436 | reclaim_state.reclaimed_slab = 0; | ||
1437 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
1438 | ret += reclaim_state.reclaimed_slab; | ||
1439 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); | ||
1440 | |||
1441 | out: | ||
1321 | current->reclaim_state = NULL; | 1442 | current->reclaim_state = NULL; |
1443 | |||
1322 | return ret; | 1444 | return ret; |
1323 | } | 1445 | } |
1324 | #endif | 1446 | #endif |
@@ -1360,7 +1482,6 @@ static int __init kswapd_init(void) | |||
1360 | pgdat->kswapd = find_task_by_pid(pid); | 1482 | pgdat->kswapd = find_task_by_pid(pid); |
1361 | read_unlock(&tasklist_lock); | 1483 | read_unlock(&tasklist_lock); |
1362 | } | 1484 | } |
1363 | total_memory = nr_free_pagecache_pages(); | ||
1364 | hotcpu_notifier(cpu_callback, 0); | 1485 | hotcpu_notifier(cpu_callback, 0); |
1365 | return 0; | 1486 | return 0; |
1366 | } | 1487 | } |
@@ -1416,6 +1537,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1416 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 1537 | .swap_cluster_max = max_t(unsigned long, nr_pages, |
1417 | SWAP_CLUSTER_MAX), | 1538 | SWAP_CLUSTER_MAX), |
1418 | .gfp_mask = gfp_mask, | 1539 | .gfp_mask = gfp_mask, |
1540 | .swappiness = vm_swappiness, | ||
1419 | }; | 1541 | }; |
1420 | 1542 | ||
1421 | disable_swap_token(); | 1543 | disable_swap_token(); |