path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   13
-rw-r--r--  mm/Makefile         |    2
-rw-r--r--  mm/filemap.c        |  257
-rw-r--r--  mm/filemap.h        |   36
-rw-r--r--  mm/filemap_xip.c    |    2
-rw-r--r--  mm/fremap.c         |    9
-rw-r--r--  mm/highmem.c        |    6
-rw-r--r--  mm/hugetlb.c        |  282
-rw-r--r--  mm/memory.c         |  133
-rw-r--r--  mm/memory_hotplug.c |  154
-rw-r--r--  mm/mempolicy.c      |   54
-rw-r--r--  mm/migrate.c        | 1076
-rw-r--r--  mm/mmap.c           |   14
-rw-r--r--  mm/mmzone.c         |    1
-rw-r--r--  mm/mprotect.c       |   37
-rw-r--r--  mm/msync.c          |    3
-rw-r--r--  mm/nommu.c          |    2
-rw-r--r--  mm/oom_kill.c       |    9
-rw-r--r--  mm/page-writeback.c |  100
-rw-r--r--  mm/page_alloc.c     |  670
-rw-r--r--  mm/page_io.c        |    4
-rw-r--r--  mm/pdflush.c        |   18
-rw-r--r--  mm/readahead.c      |   20
-rw-r--r--  mm/rmap.c           |  121
-rw-r--r--  mm/shmem.c          |   34
-rw-r--r--  mm/slab.c           |  392
-rw-r--r--  mm/slob.c           |    1
-rw-r--r--  mm/sparse.c         |   25
-rw-r--r--  mm/swap.c           |   49
-rw-r--r--  mm/swap_state.c     |    6
-rw-r--r--  mm/swapfile.c       |   44
-rw-r--r--  mm/tiny-shmem.c     |    4
-rw-r--r--  mm/truncate.c       |   22
-rw-r--r--  mm/vmalloc.c        |  122
-rw-r--r--  mm/vmscan.c         |  341
-rw-r--r--  mm/vmstat.c         |  614
36 files changed, 2879 insertions, 1798 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 332f5c29b53a..8f5b45615f7b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,7 +115,8 @@ config SPARSEMEM_EXTREME
115# eventually, we can have this option just 'select SPARSEMEM' 115# eventually, we can have this option just 'select SPARSEMEM'
116config MEMORY_HOTPLUG 116config MEMORY_HOTPLUG
117 bool "Allow for memory hot-add" 117 bool "Allow for memory hot-add"
118 depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND 118 depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
119 depends on (IA64 || X86 || PPC64)
119 120
120comment "Memory hotplug is currently incompatible with Software Suspend" 121comment "Memory hotplug is currently incompatible with Software Suspend"
121 depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND 122 depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
@@ -138,10 +139,16 @@ config SPLIT_PTLOCK_CPUS
138# 139#
139config MIGRATION 140config MIGRATION
140 bool "Page migration" 141 bool "Page migration"
141 def_bool y if NUMA 142 def_bool y
142 depends on SWAP && NUMA 143 depends on NUMA
143 help 144 help
144 Allows the migration of the physical location of pages of processes 145 Allows the migration of the physical location of pages of processes
145 while the virtual addresses are not changed. This is useful for 146 while the virtual addresses are not changed. This is useful for
146 example on NUMA systems to put pages nearer to the processors accessing 147 example on NUMA systems to put pages nearer to the processors accessing
147 the page. 148 the page.
149
150config RESOURCES_64BIT
151 bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
152 default 64BIT
153 help
154 This option allows memory and IO resources to be 64 bit.
diff --git a/mm/Makefile b/mm/Makefile
index 0b8f73f2ed16..9dd824c11eeb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o mmzone.o $(mmu-y) 13 prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
14 14
15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
16obj-$(CONFIG_HUGETLBFS) += hugetlb.o 16obj-$(CONFIG_HUGETLBFS) += hugetlb.o
diff --git a/mm/filemap.c b/mm/filemap.c
index a02a0b2c986b..b9c91ab7f0f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,11 +9,11 @@
9 * most "normal" filesystems (but you don't /have/ to use this: 9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example) 10 * the NFS filesystem used to do this differently, for example)
11 */ 11 */
12#include <linux/config.h>
13#include <linux/module.h> 12#include <linux/module.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/compiler.h> 14#include <linux/compiler.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/uaccess.h>
17#include <linux/aio.h> 17#include <linux/aio.h>
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
@@ -38,7 +38,6 @@
38 */ 38 */
39#include <linux/buffer_head.h> /* for generic_osync_inode */ 39#include <linux/buffer_head.h> /* for generic_osync_inode */
40 40
41#include <asm/uaccess.h>
42#include <asm/mman.h> 41#include <asm/mman.h>
43 42
44static ssize_t 43static ssize_t
@@ -120,7 +119,7 @@ void __remove_from_page_cache(struct page *page)
120 radix_tree_delete(&mapping->page_tree, page->index); 119 radix_tree_delete(&mapping->page_tree, page->index);
121 page->mapping = NULL; 120 page->mapping = NULL;
122 mapping->nrpages--; 121 mapping->nrpages--;
123 pagecache_acct(-1); 122 __dec_zone_page_state(page, NR_FILE_PAGES);
124} 123}
125 124
126void remove_from_page_cache(struct page *page) 125void remove_from_page_cache(struct page *page)
@@ -171,15 +170,17 @@ static int sync_page(void *word)
171} 170}
172 171
173/** 172/**
174 * filemap_fdatawrite_range - start writeback against all of a mapping's 173 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
175 * dirty pages that lie within the byte offsets <start, end>
176 * @mapping: address space structure to write 174 * @mapping: address space structure to write
177 * @start: offset in bytes where the range starts 175 * @start: offset in bytes where the range starts
178 * @end: offset in bytes where the range ends (inclusive) 176 * @end: offset in bytes where the range ends (inclusive)
179 * @sync_mode: enable synchronous operation 177 * @sync_mode: enable synchronous operation
180 * 178 *
179 * Start writeback against all of a mapping's dirty pages that lie
180 * within the byte offsets <start, end> inclusive.
181 *
181 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 182 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
182 * opposed to a regular memory * cleansing writeback. The difference between 183 * opposed to a regular memory cleansing writeback. The difference between
183 * these two operations is that if a dirty page/buffer is encountered, it must 184 * these two operations is that if a dirty page/buffer is encountered, it must
184 * be waited upon, and not just skipped over. 185 * be waited upon, and not just skipped over.
185 */ 186 */
@@ -190,8 +191,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
190 struct writeback_control wbc = { 191 struct writeback_control wbc = {
191 .sync_mode = sync_mode, 192 .sync_mode = sync_mode,
192 .nr_to_write = mapping->nrpages * 2, 193 .nr_to_write = mapping->nrpages * 2,
193 .start = start, 194 .range_start = start,
194 .end = end, 195 .range_end = end,
195 }; 196 };
196 197
197 if (!mapping_cap_writeback_dirty(mapping)) 198 if (!mapping_cap_writeback_dirty(mapping))
@@ -204,7 +205,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
204static inline int __filemap_fdatawrite(struct address_space *mapping, 205static inline int __filemap_fdatawrite(struct address_space *mapping,
205 int sync_mode) 206 int sync_mode)
206{ 207{
207 return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode); 208 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
208} 209}
209 210
210int filemap_fdatawrite(struct address_space *mapping) 211int filemap_fdatawrite(struct address_space *mapping)
@@ -219,7 +220,10 @@ static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
219 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 220 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
220} 221}
221 222
222/* 223/**
224 * filemap_flush - mostly a non-blocking flush
225 * @mapping: target address_space
226 *
223 * This is a mostly non-blocking flush. Not suitable for data-integrity 227 * This is a mostly non-blocking flush. Not suitable for data-integrity
224 * purposes - I/O may not be started against all dirty pages. 228 * purposes - I/O may not be started against all dirty pages.
225 */ 229 */
@@ -229,7 +233,12 @@ int filemap_flush(struct address_space *mapping)
229} 233}
230EXPORT_SYMBOL(filemap_flush); 234EXPORT_SYMBOL(filemap_flush);
231 235
232/* 236/**
237 * wait_on_page_writeback_range - wait for writeback to complete
238 * @mapping: target address_space
239 * @start: beginning page index
240 * @end: ending page index
241 *
233 * Wait for writeback to complete against pages indexed by start->end 242 * Wait for writeback to complete against pages indexed by start->end
234 * inclusive 243 * inclusive
235 */ 244 */
@@ -276,7 +285,13 @@ int wait_on_page_writeback_range(struct address_space *mapping,
276 return ret; 285 return ret;
277} 286}
278 287
279/* 288/**
289 * sync_page_range - write and wait on all pages in the passed range
290 * @inode: target inode
291 * @mapping: target address_space
292 * @pos: beginning offset in pages to write
293 * @count: number of bytes to write
294 *
280 * Write and wait upon all the pages in the passed range. This is a "data 295 * Write and wait upon all the pages in the passed range. This is a "data
281 * integrity" operation. It waits upon in-flight writeout before starting and 296 * integrity" operation. It waits upon in-flight writeout before starting and
282 * waiting upon new writeout. If there was an IO error, return it. 297 * waiting upon new writeout. If there was an IO error, return it.
@@ -305,7 +320,13 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
305} 320}
306EXPORT_SYMBOL(sync_page_range); 321EXPORT_SYMBOL(sync_page_range);
307 322
308/* 323/**
324 * sync_page_range_nolock
325 * @inode: target inode
326 * @mapping: target address_space
327 * @pos: beginning offset in pages to write
328 * @count: number of bytes to write
329 *
309 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea 330 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
310 * as it forces O_SYNC writers to different parts of the same file 331 * as it forces O_SYNC writers to different parts of the same file
311 * to be serialised right until io completion. 332 * to be serialised right until io completion.
@@ -329,10 +350,11 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
329EXPORT_SYMBOL(sync_page_range_nolock); 350EXPORT_SYMBOL(sync_page_range_nolock);
330 351
331/** 352/**
332 * filemap_fdatawait - walk the list of under-writeback pages of the given 353 * filemap_fdatawait - wait for all under-writeback pages to complete
333 * address space and wait for all of them.
334 *
335 * @mapping: address space structure to wait for 354 * @mapping: address space structure to wait for
355 *
356 * Walk the list of under-writeback pages of the given address space
357 * and wait for all of them.
336 */ 358 */
337int filemap_fdatawait(struct address_space *mapping) 359int filemap_fdatawait(struct address_space *mapping)
338{ 360{
@@ -368,7 +390,12 @@ int filemap_write_and_wait(struct address_space *mapping)
368} 390}
369EXPORT_SYMBOL(filemap_write_and_wait); 391EXPORT_SYMBOL(filemap_write_and_wait);
370 392
371/* 393/**
394 * filemap_write_and_wait_range - write out & wait on a file range
395 * @mapping: the address_space for the pages
396 * @lstart: offset in bytes where the range starts
397 * @lend: offset in bytes where the range ends (inclusive)
398 *
372 * Write out and wait upon file offsets lstart->lend, inclusive. 399 * Write out and wait upon file offsets lstart->lend, inclusive.
373 * 400 *
374 * Note that `lend' is inclusive (describes the last byte to be written) so 401 * Note that `lend' is inclusive (describes the last byte to be written) so
@@ -394,8 +421,14 @@ int filemap_write_and_wait_range(struct address_space *mapping,
394 return err; 421 return err;
395} 422}
396 423
397/* 424/**
398 * This function is used to add newly allocated pagecache pages: 425 * add_to_page_cache - add newly allocated pagecache pages
426 * @page: page to add
427 * @mapping: the page's address_space
428 * @offset: page index
429 * @gfp_mask: page allocation mode
430 *
431 * This function is used to add newly allocated pagecache pages;
399 * the page is new, so we can just run SetPageLocked() against it. 432 * the page is new, so we can just run SetPageLocked() against it.
400 * The other page state flags were set by rmqueue(). 433 * The other page state flags were set by rmqueue().
401 * 434 *
@@ -415,14 +448,13 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
415 page->mapping = mapping; 448 page->mapping = mapping;
416 page->index = offset; 449 page->index = offset;
417 mapping->nrpages++; 450 mapping->nrpages++;
418 pagecache_acct(1); 451 __inc_zone_page_state(page, NR_FILE_PAGES);
419 } 452 }
420 write_unlock_irq(&mapping->tree_lock); 453 write_unlock_irq(&mapping->tree_lock);
421 radix_tree_preload_end(); 454 radix_tree_preload_end();
422 } 455 }
423 return error; 456 return error;
424} 457}
425
426EXPORT_SYMBOL(add_to_page_cache); 458EXPORT_SYMBOL(add_to_page_cache);
427 459
428int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 460int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
@@ -489,8 +521,7 @@ void fastcall wait_on_page_bit(struct page *page, int bit_nr)
489EXPORT_SYMBOL(wait_on_page_bit); 521EXPORT_SYMBOL(wait_on_page_bit);
490 522
491/** 523/**
492 * unlock_page() - unlock a locked page 524 * unlock_page - unlock a locked page
493 *
494 * @page: the page 525 * @page: the page
495 * 526 *
496 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). 527 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
@@ -513,8 +544,9 @@ void fastcall unlock_page(struct page *page)
513} 544}
514EXPORT_SYMBOL(unlock_page); 545EXPORT_SYMBOL(unlock_page);
515 546
516/* 547/**
517 * End writeback against a page. 548 * end_page_writeback - end writeback against a page
549 * @page: the page
518 */ 550 */
519void end_page_writeback(struct page *page) 551void end_page_writeback(struct page *page)
520{ 552{
@@ -527,10 +559,11 @@ void end_page_writeback(struct page *page)
527} 559}
528EXPORT_SYMBOL(end_page_writeback); 560EXPORT_SYMBOL(end_page_writeback);
529 561
530/* 562/**
531 * Get a lock on the page, assuming we need to sleep to get it. 563 * __lock_page - get a lock on the page, assuming we need to sleep to get it
564 * @page: the page to lock
532 * 565 *
533 * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some 566 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
534 * random driver's requestfn sets TASK_RUNNING, we could busywait. However 567 * random driver's requestfn sets TASK_RUNNING, we could busywait. However
535 * chances are that on the second loop, the block layer's plug list is empty, 568 * chances are that on the second loop, the block layer's plug list is empty,
536 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. 569 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
@@ -544,8 +577,12 @@ void fastcall __lock_page(struct page *page)
544} 577}
545EXPORT_SYMBOL(__lock_page); 578EXPORT_SYMBOL(__lock_page);
546 579
547/* 580/**
548 * a rather lightweight function, finding and getting a reference to a 581 * find_get_page - find and get a page reference
582 * @mapping: the address_space to search
583 * @offset: the page index
584 *
585 * A rather lightweight function, finding and getting a reference to a
549 * hashed page atomically. 586 * hashed page atomically.
550 */ 587 */
551struct page * find_get_page(struct address_space *mapping, unsigned long offset) 588struct page * find_get_page(struct address_space *mapping, unsigned long offset)
@@ -559,11 +596,14 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset)
559 read_unlock_irq(&mapping->tree_lock); 596 read_unlock_irq(&mapping->tree_lock);
560 return page; 597 return page;
561} 598}
562
563EXPORT_SYMBOL(find_get_page); 599EXPORT_SYMBOL(find_get_page);
564 600
565/* 601/**
566 * Same as above, but trylock it instead of incrementing the count. 602 * find_trylock_page - find and lock a page
603 * @mapping: the address_space to search
604 * @offset: the page index
605 *
606 * Same as find_get_page(), but trylock it instead of incrementing the count.
567 */ 607 */
568struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) 608struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
569{ 609{
@@ -576,12 +616,10 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs
576 read_unlock_irq(&mapping->tree_lock); 616 read_unlock_irq(&mapping->tree_lock);
577 return page; 617 return page;
578} 618}
579
580EXPORT_SYMBOL(find_trylock_page); 619EXPORT_SYMBOL(find_trylock_page);
581 620
582/** 621/**
583 * find_lock_page - locate, pin and lock a pagecache page 622 * find_lock_page - locate, pin and lock a pagecache page
584 *
585 * @mapping: the address_space to search 623 * @mapping: the address_space to search
586 * @offset: the page index 624 * @offset: the page index
587 * 625 *
@@ -617,12 +655,10 @@ repeat:
617 read_unlock_irq(&mapping->tree_lock); 655 read_unlock_irq(&mapping->tree_lock);
618 return page; 656 return page;
619} 657}
620
621EXPORT_SYMBOL(find_lock_page); 658EXPORT_SYMBOL(find_lock_page);
622 659
623/** 660/**
624 * find_or_create_page - locate or add a pagecache page 661 * find_or_create_page - locate or add a pagecache page
625 *
626 * @mapping: the page's address_space 662 * @mapping: the page's address_space
627 * @index: the page's index into the mapping 663 * @index: the page's index into the mapping
628 * @gfp_mask: page allocation mode 664 * @gfp_mask: page allocation mode
@@ -663,7 +699,6 @@ repeat:
663 page_cache_release(cached_page); 699 page_cache_release(cached_page);
664 return page; 700 return page;
665} 701}
666
667EXPORT_SYMBOL(find_or_create_page); 702EXPORT_SYMBOL(find_or_create_page);
668 703
669/** 704/**
@@ -729,9 +764,16 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
729 return i; 764 return i;
730} 765}
731 766
732/* 767/**
768 * find_get_pages_tag - find and return pages that match @tag
769 * @mapping: the address_space to search
770 * @index: the starting page index
771 * @tag: the tag index
772 * @nr_pages: the maximum number of pages
773 * @pages: where the resulting pages are placed
774 *
733 * Like find_get_pages, except we only return pages which are tagged with 775 * Like find_get_pages, except we only return pages which are tagged with
734 * `tag'. We update *index to index the next page for the traversal. 776 * @tag. We update @index to index the next page for the traversal.
735 */ 777 */
736unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 778unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
737 int tag, unsigned int nr_pages, struct page **pages) 779 int tag, unsigned int nr_pages, struct page **pages)
@@ -750,7 +792,11 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
750 return ret; 792 return ret;
751} 793}
752 794
753/* 795/**
796 * grab_cache_page_nowait - returns locked page at given index in given cache
797 * @mapping: target address_space
798 * @index: the page index
799 *
754 * Same as grab_cache_page, but do not wait if the page is unavailable. 800 * Same as grab_cache_page, but do not wait if the page is unavailable.
755 * This is intended for speculative data generators, where the data can 801 * This is intended for speculative data generators, where the data can
756 * be regenerated if the page couldn't be grabbed. This routine should 802 * be regenerated if the page couldn't be grabbed. This routine should
@@ -779,19 +825,51 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
779 } 825 }
780 return page; 826 return page;
781} 827}
782
783EXPORT_SYMBOL(grab_cache_page_nowait); 828EXPORT_SYMBOL(grab_cache_page_nowait);
784 829
785/* 830/*
831 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
832 * a _large_ part of the i/o request. Imagine the worst scenario:
833 *
834 * ---R__________________________________________B__________
835 * ^ reading here ^ bad block(assume 4k)
836 *
837 * read(R) => miss => readahead(R...B) => media error => frustrating retries
838 * => failing the whole request => read(R) => read(R+1) =>
839 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
840 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
841 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
842 *
843 * It is going insane. Fix it by quickly scaling down the readahead size.
844 */
845static void shrink_readahead_size_eio(struct file *filp,
846 struct file_ra_state *ra)
847{
848 if (!ra->ra_pages)
849 return;
850
851 ra->ra_pages /= 4;
852 printk(KERN_WARNING "Reducing readahead size to %luK\n",
853 ra->ra_pages << (PAGE_CACHE_SHIFT - 10));
854}
855
856/**
857 * do_generic_mapping_read - generic file read routine
858 * @mapping: address_space to be read
859 * @_ra: file's readahead state
860 * @filp: the file to read
861 * @ppos: current file position
862 * @desc: read_descriptor
863 * @actor: read method
864 *
786 * This is a generic file read routine, and uses the 865 * This is a generic file read routine, and uses the
787 * mapping->a_ops->readpage() function for the actual low-level 866 * mapping->a_ops->readpage() function for the actual low-level stuff.
788 * stuff.
789 * 867 *
790 * This is really ugly. But the goto's actually try to clarify some 868 * This is really ugly. But the goto's actually try to clarify some
791 * of the logic when it comes to error handling etc. 869 * of the logic when it comes to error handling etc.
792 * 870 *
793 * Note the struct file* is only passed for the use of readpage. It may be 871 * Note the struct file* is only passed for the use of readpage.
794 * NULL. 872 * It may be NULL.
795 */ 873 */
796void do_generic_mapping_read(struct address_space *mapping, 874void do_generic_mapping_read(struct address_space *mapping,
797 struct file_ra_state *_ra, 875 struct file_ra_state *_ra,
@@ -932,6 +1010,7 @@ readpage:
932 } 1010 }
933 unlock_page(page); 1011 unlock_page(page);
934 error = -EIO; 1012 error = -EIO;
1013 shrink_readahead_size_eio(filp, &ra);
935 goto readpage_error; 1014 goto readpage_error;
936 } 1015 }
937 unlock_page(page); 1016 unlock_page(page);
@@ -1004,7 +1083,6 @@ out:
1004 if (filp) 1083 if (filp)
1005 file_accessed(filp); 1084 file_accessed(filp);
1006} 1085}
1007
1008EXPORT_SYMBOL(do_generic_mapping_read); 1086EXPORT_SYMBOL(do_generic_mapping_read);
1009 1087
1010int file_read_actor(read_descriptor_t *desc, struct page *page, 1088int file_read_actor(read_descriptor_t *desc, struct page *page,
@@ -1046,7 +1124,13 @@ success:
1046} 1124}
1047EXPORT_SYMBOL_GPL(file_read_actor); 1125EXPORT_SYMBOL_GPL(file_read_actor);
1048 1126
1049/* 1127/**
1128 * __generic_file_aio_read - generic filesystem read routine
1129 * @iocb: kernel I/O control block
1130 * @iov: io vector request
1131 * @nr_segs: number of segments in the iovec
1132 * @ppos: current file position
1133 *
1050 * This is the "read()" routine for all filesystems 1134 * This is the "read()" routine for all filesystems
1051 * that can use the page cache directly. 1135 * that can use the page cache directly.
1052 */ 1136 */
@@ -1125,7 +1209,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1125out: 1209out:
1126 return retval; 1210 return retval;
1127} 1211}
1128
1129EXPORT_SYMBOL(__generic_file_aio_read); 1212EXPORT_SYMBOL(__generic_file_aio_read);
1130 1213
1131ssize_t 1214ssize_t
@@ -1136,7 +1219,6 @@ generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t
1136 BUG_ON(iocb->ki_pos != pos); 1219 BUG_ON(iocb->ki_pos != pos);
1137 return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); 1220 return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
1138} 1221}
1139
1140EXPORT_SYMBOL(generic_file_aio_read); 1222EXPORT_SYMBOL(generic_file_aio_read);
1141 1223
1142ssize_t 1224ssize_t
@@ -1152,7 +1234,6 @@ generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppo
1152 ret = wait_on_sync_kiocb(&kiocb); 1234 ret = wait_on_sync_kiocb(&kiocb);
1153 return ret; 1235 return ret;
1154} 1236}
1155
1156EXPORT_SYMBOL(generic_file_read); 1237EXPORT_SYMBOL(generic_file_read);
1157 1238
1158int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) 1239int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
@@ -1193,7 +1274,6 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
1193 return desc.written; 1274 return desc.written;
1194 return desc.error; 1275 return desc.error;
1195} 1276}
1196
1197EXPORT_SYMBOL(generic_file_sendfile); 1277EXPORT_SYMBOL(generic_file_sendfile);
1198 1278
1199static ssize_t 1279static ssize_t
@@ -1229,11 +1309,15 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1229} 1309}
1230 1310
1231#ifdef CONFIG_MMU 1311#ifdef CONFIG_MMU
1232/* 1312static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1313/**
1314 * page_cache_read - adds requested page to the page cache if not already there
1315 * @file: file to read
1316 * @offset: page index
1317 *
1233 * This adds the requested page to the page cache if it isn't already there, 1318 * This adds the requested page to the page cache if it isn't already there,
1234 * and schedules an I/O to read in its contents from disk. 1319 * and schedules an I/O to read in its contents from disk.
1235 */ 1320 */
1236static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1237static int fastcall page_cache_read(struct file * file, unsigned long offset) 1321static int fastcall page_cache_read(struct file * file, unsigned long offset)
1238{ 1322{
1239 struct address_space *mapping = file->f_mapping; 1323 struct address_space *mapping = file->f_mapping;
@@ -1260,7 +1344,12 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
1260 1344
1261#define MMAP_LOTSAMISS (100) 1345#define MMAP_LOTSAMISS (100)
1262 1346
1263/* 1347/**
1348 * filemap_nopage - read in file data for page fault handling
1349 * @area: the applicable vm_area
1350 * @address: target address to read in
1351 * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
1352 *
1264 * filemap_nopage() is invoked via the vma operations vector for a 1353 * filemap_nopage() is invoked via the vma operations vector for a
1265 * mapped memory region to read in file data during a page fault. 1354 * mapped memory region to read in file data during a page fault.
1266 * 1355 *
@@ -1327,7 +1416,7 @@ retry_find:
1327 */ 1416 */
1328 if (!did_readaround) { 1417 if (!did_readaround) {
1329 majmin = VM_FAULT_MAJOR; 1418 majmin = VM_FAULT_MAJOR;
1330 inc_page_state(pgmajfault); 1419 count_vm_event(PGMAJFAULT);
1331 } 1420 }
1332 did_readaround = 1; 1421 did_readaround = 1;
1333 ra_pages = max_sane_readahead(file->f_ra.ra_pages); 1422 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
@@ -1398,7 +1487,7 @@ no_cached_page:
1398page_not_uptodate: 1487page_not_uptodate:
1399 if (!did_readaround) { 1488 if (!did_readaround) {
1400 majmin = VM_FAULT_MAJOR; 1489 majmin = VM_FAULT_MAJOR;
1401 inc_page_state(pgmajfault); 1490 count_vm_event(PGMAJFAULT);
1402 } 1491 }
1403 lock_page(page); 1492 lock_page(page);
1404 1493
@@ -1460,10 +1549,10 @@ page_not_uptodate:
1460 * Things didn't work out. Return zero to tell the 1549 * Things didn't work out. Return zero to tell the
1461 * mm layer so, possibly freeing the page cache page first. 1550 * mm layer so, possibly freeing the page cache page first.
1462 */ 1551 */
1552 shrink_readahead_size_eio(file, ra);
1463 page_cache_release(page); 1553 page_cache_release(page);
1464 return NULL; 1554 return NULL;
1465} 1555}
1466
1467EXPORT_SYMBOL(filemap_nopage); 1556EXPORT_SYMBOL(filemap_nopage);
1468 1557
1469static struct page * filemap_getpage(struct file *file, unsigned long pgoff, 1558static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
@@ -1717,7 +1806,13 @@ repeat:
1717 return page; 1806 return page;
1718} 1807}
1719 1808
1720/* 1809/**
1810 * read_cache_page - read into page cache, fill it if needed
1811 * @mapping: the page's address_space
1812 * @index: the page index
1813 * @filler: function to perform the read
1814 * @data: destination for read data
1815 *
1721 * Read into the page cache. If a page already exists, 1816 * Read into the page cache. If a page already exists,
1722 * and PageUptodate() is not set, try to fill the page. 1817 * and PageUptodate() is not set, try to fill the page.
1723 */ 1818 */
@@ -1755,7 +1850,6 @@ retry:
1755 out: 1850 out:
1756 return page; 1851 return page;
1757} 1852}
1758
1759EXPORT_SYMBOL(read_cache_page); 1853EXPORT_SYMBOL(read_cache_page);
1760 1854
1761/* 1855/*
@@ -1826,7 +1920,7 @@ int remove_suid(struct dentry *dentry)
1826EXPORT_SYMBOL(remove_suid); 1920EXPORT_SYMBOL(remove_suid);
1827 1921
1828size_t 1922size_t
1829__filemap_copy_from_user_iovec(char *vaddr, 1923__filemap_copy_from_user_iovec_inatomic(char *vaddr,
1830 const struct iovec *iov, size_t base, size_t bytes) 1924 const struct iovec *iov, size_t base, size_t bytes)
1831{ 1925{
1832 size_t copied = 0, left = 0; 1926 size_t copied = 0, left = 0;
@@ -1836,18 +1930,14 @@ __filemap_copy_from_user_iovec(char *vaddr,
1836 int copy = min(bytes, iov->iov_len - base); 1930 int copy = min(bytes, iov->iov_len - base);
1837 1931
1838 base = 0; 1932 base = 0;
1839 left = __copy_from_user_inatomic(vaddr, buf, copy); 1933 left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
1840 copied += copy; 1934 copied += copy;
1841 bytes -= copy; 1935 bytes -= copy;
1842 vaddr += copy; 1936 vaddr += copy;
1843 iov++; 1937 iov++;
1844 1938
1845 if (unlikely(left)) { 1939 if (unlikely(left))
1846 /* zero the rest of the target like __copy_from_user */
1847 if (bytes)
1848 memset(vaddr, 0, bytes);
1849 break; 1940 break;
1850 }
1851 } 1941 }
1852 return copied - left; 1942 return copied - left;
1853} 1943}
@@ -1855,7 +1945,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
1855/* 1945/*
1856 * Performs necessary checks before doing a write 1946 * Performs necessary checks before doing a write
1857 * 1947 *
1858 * Can adjust writing position aor amount of bytes to write. 1948 * Can adjust writing position or amount of bytes to write.
1859 * Returns appropriate error code that caller should return or 1949 * Returns appropriate error code that caller should return or
1860 * zero in case that write should be allowed. 1950 * zero in case that write should be allowed.
1861 */ 1951 */
@@ -1979,7 +2069,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1979{ 2069{
1980 struct file *file = iocb->ki_filp; 2070 struct file *file = iocb->ki_filp;
1981 struct address_space * mapping = file->f_mapping; 2071 struct address_space * mapping = file->f_mapping;
1982 struct address_space_operations *a_ops = mapping->a_ops; 2072 const struct address_space_operations *a_ops = mapping->a_ops;
1983 struct inode *inode = mapping->host; 2073 struct inode *inode = mapping->host;
1984 long status = 0; 2074 long status = 0;
1985 struct page *page; 2075 struct page *page;
@@ -2005,14 +2095,21 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2005 do { 2095 do {
2006 unsigned long index; 2096 unsigned long index;
2007 unsigned long offset; 2097 unsigned long offset;
2008 unsigned long maxlen;
2009 size_t copied; 2098 size_t copied;
2010 2099
2011 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 2100 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2012 index = pos >> PAGE_CACHE_SHIFT; 2101 index = pos >> PAGE_CACHE_SHIFT;
2013 bytes = PAGE_CACHE_SIZE - offset; 2102 bytes = PAGE_CACHE_SIZE - offset;
2014 if (bytes > count) 2103
2015 bytes = count; 2104 /* Limit the size of the copy to the caller's write size */
2105 bytes = min(bytes, count);
2106
2107 /*
2108 * Limit the size of the copy to that of the current segment,
2109 * because fault_in_pages_readable() doesn't know how to walk
2110 * segments.
2111 */
2112 bytes = min(bytes, cur_iov->iov_len - iov_base);
2016 2113
2017 /* 2114 /*
2018 * Bring in the user page that we will copy from _first_. 2115 * Bring in the user page that we will copy from _first_.
@@ -2020,10 +2117,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2020 * same page as we're writing to, without it being marked 2117 * same page as we're writing to, without it being marked
2021 * up-to-date. 2118 * up-to-date.
2022 */ 2119 */
2023 maxlen = cur_iov->iov_len - iov_base; 2120 fault_in_pages_readable(buf, bytes);
2024 if (maxlen > bytes)
2025 maxlen = bytes;
2026 fault_in_pages_readable(buf, maxlen);
2027 2121
2028 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); 2122 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
2029 if (!page) { 2123 if (!page) {
@@ -2031,6 +2125,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2031 break; 2125 break;
2032 } 2126 }
2033 2127
2128 if (unlikely(bytes == 0)) {
2129 status = 0;
2130 copied = 0;
2131 goto zero_length_segment;
2132 }
2133
2034 status = a_ops->prepare_write(file, page, offset, offset+bytes); 2134 status = a_ops->prepare_write(file, page, offset, offset+bytes);
2035 if (unlikely(status)) { 2135 if (unlikely(status)) {
2036 loff_t isize = i_size_read(inode); 2136 loff_t isize = i_size_read(inode);
@@ -2060,7 +2160,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2060 page_cache_release(page); 2160 page_cache_release(page);
2061 continue; 2161 continue;
2062 } 2162 }
2063 if (likely(copied > 0)) { 2163zero_length_segment:
2164 if (likely(copied >= 0)) {
2064 if (!status) 2165 if (!status)
2065 status = copied; 2166 status = copied;
2066 2167
@@ -2125,7 +2226,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2125 unsigned long nr_segs, loff_t *ppos) 2226 unsigned long nr_segs, loff_t *ppos)
2126{ 2227{
2127 struct file *file = iocb->ki_filp; 2228 struct file *file = iocb->ki_filp;
2128 struct address_space * mapping = file->f_mapping; 2229 const struct address_space * mapping = file->f_mapping;
2129 size_t ocount; /* original count */ 2230 size_t ocount; /* original count */
2130 size_t count; /* after file limit checks */ 2231 size_t count; /* after file limit checks */
2131 struct inode *inode = mapping->host; 2232 struct inode *inode = mapping->host;
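[Editor's note] The kerneldoc added above spells out the WB_SYNC_ALL contract: a data-integrity writeback must wait on any dirty page it meets rather than skip it. As a rough illustration of how a caller drives these helpers, here is a minimal sketch that is not part of the patch (the example_* names are invented), built only on filemap_fdatawrite(), filemap_fdatawait() and the filemap_write_and_wait_range() documented in this hunk:

#include <linux/fs.h>
#include <linux/pagemap.h>

/*
 * Sketch: whole-file data-integrity flush.  filemap_fdatawrite() starts
 * WB_SYNC_ALL writeback, filemap_fdatawait() then waits for it to finish.
 */
static int example_flush_whole_file(struct address_space *mapping)
{
	int err = filemap_fdatawrite(mapping);
	int err2 = filemap_fdatawait(mapping);

	return err ? err : err2;
}

/* A byte range can be done in one call; note that lend is inclusive. */
static int example_flush_range(struct address_space *mapping,
			       loff_t pos, loff_t count)
{
	return filemap_write_and_wait_range(mapping, pos, pos + count - 1);
}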
diff --git a/mm/filemap.h b/mm/filemap.h
index 13793ba0ce17..3f2a343c6015 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -13,18 +13,26 @@
13#include <linux/highmem.h> 13#include <linux/highmem.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/config.h> 15#include <linux/config.h>
16#include <asm/uaccess.h> 16#include <linux/uaccess.h>
17 17
18size_t 18size_t
19__filemap_copy_from_user_iovec(char *vaddr, 19__filemap_copy_from_user_iovec_inatomic(char *vaddr,
20 const struct iovec *iov, 20 const struct iovec *iov,
21 size_t base, 21 size_t base,
22 size_t bytes); 22 size_t bytes);
23 23
24/* 24/*
25 * Copy as much as we can into the page and return the number of bytes which 25 * Copy as much as we can into the page and return the number of bytes which
26 * were sucessfully copied. If a fault is encountered then clear the page 26 * were sucessfully copied. If a fault is encountered then clear the page
27 * out to (offset+bytes) and return the number of bytes which were copied. 27 * out to (offset+bytes) and return the number of bytes which were copied.
28 *
29 * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
30 * to *NOT* zero any tail of the buffer that it failed to copy. If it does,
31 * and if the following non-atomic copy succeeds, then there is a small window
32 * where the target page contains neither the data before the write, nor the
33 * data after the write (it contains zero). A read at this time will see
34 * data that is inconsistent with any ordering of the read and the write.
35 * (This has been detected in practice).
28 */ 36 */
29static inline size_t 37static inline size_t
30filemap_copy_from_user(struct page *page, unsigned long offset, 38filemap_copy_from_user(struct page *page, unsigned long offset,
@@ -34,13 +42,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
34 int left; 42 int left;
35 43
36 kaddr = kmap_atomic(page, KM_USER0); 44 kaddr = kmap_atomic(page, KM_USER0);
37 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); 45 left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
38 kunmap_atomic(kaddr, KM_USER0); 46 kunmap_atomic(kaddr, KM_USER0);
39 47
40 if (left != 0) { 48 if (left != 0) {
41 /* Do it the slow way */ 49 /* Do it the slow way */
42 kaddr = kmap(page); 50 kaddr = kmap(page);
43 left = __copy_from_user(kaddr + offset, buf, bytes); 51 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
44 kunmap(page); 52 kunmap(page);
45 } 53 }
46 return bytes - left; 54 return bytes - left;
@@ -60,13 +68,15 @@ filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
60 size_t copied; 68 size_t copied;
61 69
62 kaddr = kmap_atomic(page, KM_USER0); 70 kaddr = kmap_atomic(page, KM_USER0);
63 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, 71 copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
64 base, bytes); 72 base, bytes);
65 kunmap_atomic(kaddr, KM_USER0); 73 kunmap_atomic(kaddr, KM_USER0);
66 if (copied != bytes) { 74 if (copied != bytes) {
67 kaddr = kmap(page); 75 kaddr = kmap(page);
68 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, 76 copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
69 base, bytes); 77 base, bytes);
78 if (bytes - copied)
79 memset(kaddr + offset + copied, 0, bytes - copied);
70 kunmap(page); 80 kunmap(page);
71 } 81 }
72 return copied; 82 return copied;
@@ -78,7 +88,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
78 const struct iovec *iov = *iovp; 88 const struct iovec *iov = *iovp;
79 size_t base = *basep; 89 size_t base = *basep;
80 90
81 while (bytes) { 91 do {
82 int copy = min(bytes, iov->iov_len - base); 92 int copy = min(bytes, iov->iov_len - base);
83 93
84 bytes -= copy; 94 bytes -= copy;
@@ -87,7 +97,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
87 iov++; 97 iov++;
88 base = 0; 98 base = 0;
89 } 99 }
90 } 100 } while (bytes);
91 *iovp = iov; 101 *iovp = iov;
92 *basep = base; 102 *basep = base;
93} 103}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b960ac8e5918..b4fd0d7c9bfb 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -273,7 +273,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
273 size_t count, loff_t pos, loff_t *ppos) 273 size_t count, loff_t pos, loff_t *ppos)
274{ 274{
275 struct address_space * mapping = filp->f_mapping; 275 struct address_space * mapping = filp->f_mapping;
276 struct address_space_operations *a_ops = mapping->a_ops; 276 const struct address_space_operations *a_ops = mapping->a_ops;
277 struct inode *inode = mapping->host; 277 struct inode *inode = mapping->host;
278 long status = 0; 278 long status = 0;
279 struct page *page; 279 struct page *page;
diff --git a/mm/fremap.c b/mm/fremap.c
index 9f381e58bf44..21b7d0cbc98c 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -83,6 +83,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
83 page_add_file_rmap(page); 83 page_add_file_rmap(page);
84 pte_val = *pte; 84 pte_val = *pte;
85 update_mmu_cache(vma, addr, pte_val); 85 update_mmu_cache(vma, addr, pte_val);
86 lazy_mmu_prot_update(pte_val);
86 err = 0; 87 err = 0;
87unlock: 88unlock:
88 pte_unmap_unlock(pte, ptl); 89 pte_unmap_unlock(pte, ptl);
@@ -114,7 +115,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
114 115
115 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); 116 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
116 pte_val = *pte; 117 pte_val = *pte;
117 update_mmu_cache(vma, addr, pte_val); 118 /*
119 * We don't need to run update_mmu_cache() here because the "file pte"
120 * being installed by install_file_pte() is not a real pte - it's a
121 * non-present entry (like a swap entry), noting what file offset should
122 * be mapped there when there's a fault (in a non-linear vma where
123 * that's not obvious).
124 */
118 pte_unmap_unlock(pte, ptl); 125 pte_unmap_unlock(pte, ptl);
119 err = 0; 126 err = 0;
120out: 127out:
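[Editor's note] The comment added to install_file_pte() explains that a "file pte" is a non-present entry recording a file offset, which only arises in a non-linear VMA. For context, here is a small user-space example (not from the patch) of how such a VMA comes about via remap_file_pages(); "datafile" is a placeholder for any writable file of at least four pages:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("datafile", O_RDWR);	/* placeholder file */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Ordinary linear MAP_SHARED mapping of the first four pages. */
	char *p = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Rewire the first page of the mapping to file page 3.  The VMA is
	 * now non-linear; the kernel remembers the per-page offset in the
	 * non-present "file pte" written by install_file_pte().
	 */
	if (remap_file_pages(p, psz, 0, 3, 0)) {
		perror("remap_file_pages");
		return 1;
	}

	printf("page 0 of the mapping now shows file offset %ld\n", 3 * psz);
	return 0;
}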
diff --git a/mm/highmem.c b/mm/highmem.c
index 9b274fdf9d08..9b2a5403c447 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -315,8 +315,8 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
315 if (bvec->bv_page == org_vec->bv_page) 315 if (bvec->bv_page == org_vec->bv_page)
316 continue; 316 continue;
317 317
318 mempool_free(bvec->bv_page, pool); 318 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
319 dec_page_state(nr_bounce); 319 mempool_free(bvec->bv_page, pool);
320 } 320 }
321 321
322 bio_endio(bio_orig, bio_orig->bi_size, err); 322 bio_endio(bio_orig, bio_orig->bi_size, err);
@@ -397,7 +397,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
397 to->bv_page = mempool_alloc(pool, q->bounce_gfp); 397 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
398 to->bv_len = from->bv_len; 398 to->bv_len = from->bv_len;
399 to->bv_offset = from->bv_offset; 399 to->bv_offset = from->bv_offset;
400 inc_page_state(nr_bounce); 400 inc_zone_page_state(to->bv_page, NR_BOUNCE);
401 401
402 if (rw == WRITE) { 402 if (rw == WRITE) {
403 char *vto, *vfrom; 403 char *vto, *vfrom;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832f676ca038..df499973255f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
22#include "internal.h" 22#include "internal.h"
23 23
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26unsigned long max_huge_pages; 26unsigned long max_huge_pages;
27static struct list_head hugepage_freelists[MAX_NUMNODES]; 27static struct list_head hugepage_freelists[MAX_NUMNODES];
28static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 28static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void)
123static struct page *alloc_huge_page(struct vm_area_struct *vma, 123static struct page *alloc_huge_page(struct vm_area_struct *vma,
124 unsigned long addr) 124 unsigned long addr)
125{ 125{
126 struct inode *inode = vma->vm_file->f_dentry->d_inode;
127 struct page *page; 126 struct page *page;
128 int use_reserve = 0;
129 unsigned long idx;
130 127
131 spin_lock(&hugetlb_lock); 128 spin_lock(&hugetlb_lock);
132 129 if (vma->vm_flags & VM_MAYSHARE)
133 if (vma->vm_flags & VM_MAYSHARE) { 130 resv_huge_pages--;
134 131 else if (free_huge_pages <= resv_huge_pages)
135 /* idx = radix tree index, i.e. offset into file in 132 goto fail;
136 * HPAGE_SIZE units */
137 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
138 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
139
140 /* The hugetlbfs specific inode info stores the number
141 * of "guaranteed available" (huge) pages. That is,
142 * the first 'prereserved_hpages' pages of the inode
143 * are either already instantiated, or have been
144 * pre-reserved (by hugetlb_reserve_for_inode()). Here
145 * we're in the process of instantiating the page, so
146 * we use this to determine whether to draw from the
147 * pre-reserved pool or the truly free pool. */
148 if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
149 use_reserve = 1;
150 }
151
152 if (!use_reserve) {
153 if (free_huge_pages <= reserved_huge_pages)
154 goto fail;
155 } else {
156 BUG_ON(reserved_huge_pages == 0);
157 reserved_huge_pages--;
158 }
159 133
160 page = dequeue_huge_page(vma, addr); 134 page = dequeue_huge_page(vma, addr);
161 if (!page) 135 if (!page)
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
165 set_page_refcounted(page); 139 set_page_refcounted(page);
166 return page; 140 return page;
167 141
168 fail: 142fail:
169 WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
170 spin_unlock(&hugetlb_lock); 143 spin_unlock(&hugetlb_lock);
171 return NULL; 144 return NULL;
172} 145}
173 146
174/* hugetlb_extend_reservation()
175 *
176 * Ensure that at least 'atleast' hugepages are, and will remain,
177 * available to instantiate the first 'atleast' pages of the given
178 * inode. If the inode doesn't already have this many pages reserved
179 * or instantiated, set aside some hugepages in the reserved pool to
180 * satisfy later faults (or fail now if there aren't enough, rather
181 * than getting the SIGBUS later).
182 */
183int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
184 unsigned long atleast)
185{
186 struct inode *inode = &info->vfs_inode;
187 unsigned long change_in_reserve = 0;
188 int ret = 0;
189
190 spin_lock(&hugetlb_lock);
191 read_lock_irq(&inode->i_mapping->tree_lock);
192
193 if (info->prereserved_hpages >= atleast)
194 goto out;
195
196 /* Because we always call this on shared mappings, none of the
197 * pages beyond info->prereserved_hpages can have been
198 * instantiated, so we need to reserve all of them now. */
199 change_in_reserve = atleast - info->prereserved_hpages;
200
201 if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
202 ret = -ENOMEM;
203 goto out;
204 }
205
206 reserved_huge_pages += change_in_reserve;
207 info->prereserved_hpages = atleast;
208
209 out:
210 read_unlock_irq(&inode->i_mapping->tree_lock);
211 spin_unlock(&hugetlb_lock);
212
213 return ret;
214}
215
216/* hugetlb_truncate_reservation()
217 *
218 * This returns pages reserved for the given inode to the general free
219 * hugepage pool. If the inode has any pages prereserved, but not
220 * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
221 * them.
222 */
223void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
224 unsigned long atmost)
225{
226 struct inode *inode = &info->vfs_inode;
227 struct address_space *mapping = inode->i_mapping;
228 unsigned long idx;
229 unsigned long change_in_reserve = 0;
230 struct page *page;
231
232 spin_lock(&hugetlb_lock);
233 read_lock_irq(&inode->i_mapping->tree_lock);
234
235 if (info->prereserved_hpages <= atmost)
236 goto out;
237
238 /* Count pages which were reserved, but not instantiated, and
239 * which we can now release. */
240 for (idx = atmost; idx < info->prereserved_hpages; idx++) {
241 page = radix_tree_lookup(&mapping->page_tree, idx);
242 if (!page)
243 /* Pages which are already instantiated can't
244 * be unreserved (and in fact have already
245 * been removed from the reserved pool) */
246 change_in_reserve++;
247 }
248
249 BUG_ON(reserved_huge_pages < change_in_reserve);
250 reserved_huge_pages -= change_in_reserve;
251 info->prereserved_hpages = atmost;
252
253 out:
254 read_unlock_irq(&inode->i_mapping->tree_lock);
255 spin_unlock(&hugetlb_lock);
256}
257
258static int __init hugetlb_init(void) 147static int __init hugetlb_init(void)
259{ 148{
260 unsigned long i; 149 unsigned long i;
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
334 return nr_huge_pages; 223 return nr_huge_pages;
335 224
336 spin_lock(&hugetlb_lock); 225 spin_lock(&hugetlb_lock);
337 count = max(count, reserved_huge_pages); 226 count = max(count, resv_huge_pages);
338 try_to_free_low(count); 227 try_to_free_low(count);
339 while (count < nr_huge_pages) { 228 while (count < nr_huge_pages) {
340 struct page *page = dequeue_huge_page(NULL, 0); 229 struct page *page = dequeue_huge_page(NULL, 0);
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf)
361 return sprintf(buf, 250 return sprintf(buf,
362 "HugePages_Total: %5lu\n" 251 "HugePages_Total: %5lu\n"
363 "HugePages_Free: %5lu\n" 252 "HugePages_Free: %5lu\n"
364 "HugePages_Rsvd: %5lu\n" 253 "HugePages_Rsvd: %5lu\n"
365 "Hugepagesize: %5lu kB\n", 254 "Hugepagesize: %5lu kB\n",
366 nr_huge_pages, 255 nr_huge_pages,
367 free_huge_pages, 256 free_huge_pages,
368 reserved_huge_pages, 257 resv_huge_pages,
369 HPAGE_SIZE/1024); 258 HPAGE_SIZE/1024);
370} 259}
371 260
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
754 flush_tlb_range(vma, start, end); 643 flush_tlb_range(vma, start, end);
755} 644}
756 645
646struct file_region {
647 struct list_head link;
648 long from;
649 long to;
650};
651
652static long region_add(struct list_head *head, long f, long t)
653{
654 struct file_region *rg, *nrg, *trg;
655
656 /* Locate the region we are either in or before. */
657 list_for_each_entry(rg, head, link)
658 if (f <= rg->to)
659 break;
660
661 /* Round our left edge to the current segment if it encloses us. */
662 if (f > rg->from)
663 f = rg->from;
664
665 /* Check for and consume any regions we now overlap with. */
666 nrg = rg;
667 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
668 if (&rg->link == head)
669 break;
670 if (rg->from > t)
671 break;
672
673 /* If this area reaches higher then extend our area to
674 * include it completely. If this is not the first area
675 * which we intend to reuse, free it. */
676 if (rg->to > t)
677 t = rg->to;
678 if (rg != nrg) {
679 list_del(&rg->link);
680 kfree(rg);
681 }
682 }
683 nrg->from = f;
684 nrg->to = t;
685 return 0;
686}
687
688static long region_chg(struct list_head *head, long f, long t)
689{
690 struct file_region *rg, *nrg;
691 long chg = 0;
692
693 /* Locate the region we are before or in. */
694 list_for_each_entry(rg, head, link)
695 if (f <= rg->to)
696 break;
697
698 /* If we are below the current region then a new region is required.
699 * Subtle, allocate a new region at the position but make it zero
700 * size such that we can guarentee to record the reservation. */
701 if (&rg->link == head || t < rg->from) {
702 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
703 if (nrg == 0)
704 return -ENOMEM;
705 nrg->from = f;
706 nrg->to = f;
707 INIT_LIST_HEAD(&nrg->link);
708 list_add(&nrg->link, rg->link.prev);
709
710 return t - f;
711 }
712
713 /* Round our left edge to the current segment if it encloses us. */
714 if (f > rg->from)
715 f = rg->from;
716 chg = t - f;
717
718 /* Check for and consume any regions we now overlap with. */
719 list_for_each_entry(rg, rg->link.prev, link) {
720 if (&rg->link == head)
721 break;
722 if (rg->from > t)
723 return chg;
724
725 /* We overlap with this area, if it extends futher than
726 * us then we must extend ourselves. Account for its
727 * existing reservation. */
728 if (rg->to > t) {
729 chg += rg->to - t;
730 t = rg->to;
731 }
732 chg -= rg->to - rg->from;
733 }
734 return chg;
735}
736
737static long region_truncate(struct list_head *head, long end)
738{
739 struct file_region *rg, *trg;
740 long chg = 0;
741
742 /* Locate the region we are either in or before. */
743 list_for_each_entry(rg, head, link)
744 if (end <= rg->to)
745 break;
746 if (&rg->link == head)
747 return 0;
748
749 /* If we are in the middle of a region then adjust it. */
750 if (end > rg->from) {
751 chg = rg->to - end;
752 rg->to = end;
753 rg = list_entry(rg->link.next, typeof(*rg), link);
754 }
755
756 /* Drop any remaining regions. */
757 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
758 if (&rg->link == head)
759 break;
760 chg += rg->to - rg->from;
761 list_del(&rg->link);
762 kfree(rg);
763 }
764 return chg;
765}
766
767static int hugetlb_acct_memory(long delta)
768{
769 int ret = -ENOMEM;
770
771 spin_lock(&hugetlb_lock);
772 if ((delta + resv_huge_pages) <= free_huge_pages) {
773 resv_huge_pages += delta;
774 ret = 0;
775 }
776 spin_unlock(&hugetlb_lock);
777 return ret;
778}
779
780int hugetlb_reserve_pages(struct inode *inode, long from, long to)
781{
782 long ret, chg;
783
784 chg = region_chg(&inode->i_mapping->private_list, from, to);
785 if (chg < 0)
786 return chg;
787 ret = hugetlb_acct_memory(chg);
788 if (ret < 0)
789 return ret;
790 region_add(&inode->i_mapping->private_list, from, to);
791 return 0;
792}
793
794void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
795{
796 long chg = region_truncate(&inode->i_mapping->private_list, offset);
797 hugetlb_acct_memory(freed - chg);
798}
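[Editor's note] The new region_chg()/region_add() pair implements a two-phase reservation: region_chg() only counts how many pages in [from, to) are not yet covered, hugetlb_acct_memory() charges that many against the free pool, and region_add() then commits the range. Below is a toy user-space model of the same protocol, with a fixed bitmap standing in for the kernel's list of struct file_region; the function names are reused purely for illustration:

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 16
static bool covered[NPAGES];

/* Count only the pages in [from, to) that are not yet reserved. */
static long region_chg(long from, long to)
{
	long chg = 0;
	long i;

	for (i = from; i < to; i++)
		if (!covered[i])
			chg++;
	return chg;
}

/* Commit the range once the charge has been accepted. */
static void region_add(long from, long to)
{
	long i;

	for (i = from; i < to; i++)
		covered[i] = true;
}

int main(void)
{
	printf("reserve [0,4): charge %ld\n", region_chg(0, 4));	/* 4 */
	region_add(0, 4);
	printf("reserve [2,6): charge %ld\n", region_chg(2, 6));	/* 2 */
	region_add(2, 6);
	printf("reserve [1,5): charge %ld\n", region_chg(1, 5));	/* 0 */
	return 0;
}

Overlapping reservations are charged only once, which is why hugetlb_reserve_pages() can call region_chg() before touching resv_huge_pages.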
diff --git a/mm/memory.c b/mm/memory.c
index 0ec7bc644271..7e2a4b1580e3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -126,7 +126,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
126 pmd_clear(pmd); 126 pmd_clear(pmd);
127 pte_lock_deinit(page); 127 pte_lock_deinit(page);
128 pte_free_tlb(tlb, page); 128 pte_free_tlb(tlb, page);
129 dec_page_state(nr_page_table_pages); 129 dec_zone_page_state(page, NR_PAGETABLE);
130 tlb->mm->nr_ptes--; 130 tlb->mm->nr_ptes--;
131} 131}
132 132
@@ -311,7 +311,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
311 pte_free(new); 311 pte_free(new);
312 } else { 312 } else {
313 mm->nr_ptes++; 313 mm->nr_ptes++;
314 inc_page_state(nr_page_table_pages); 314 inc_zone_page_state(new, NR_PAGETABLE);
315 pmd_populate(mm, pmd, new); 315 pmd_populate(mm, pmd, new);
316 } 316 }
317 spin_unlock(&mm->page_table_lock); 317 spin_unlock(&mm->page_table_lock);
@@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
434 /* pte contains position in swap or file, so copy. */ 434 /* pte contains position in swap or file, so copy. */
435 if (unlikely(!pte_present(pte))) { 435 if (unlikely(!pte_present(pte))) {
436 if (!pte_file(pte)) { 436 if (!pte_file(pte)) {
437 swap_duplicate(pte_to_swp_entry(pte)); 437 swp_entry_t entry = pte_to_swp_entry(pte);
438
439 swap_duplicate(entry);
438 /* make sure dst_mm is on swapoff's mmlist. */ 440 /* make sure dst_mm is on swapoff's mmlist. */
439 if (unlikely(list_empty(&dst_mm->mmlist))) { 441 if (unlikely(list_empty(&dst_mm->mmlist))) {
440 spin_lock(&mmlist_lock); 442 spin_lock(&mmlist_lock);
@@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
443 &src_mm->mmlist); 445 &src_mm->mmlist);
444 spin_unlock(&mmlist_lock); 446 spin_unlock(&mmlist_lock);
445 } 447 }
448 if (is_write_migration_entry(entry) &&
449 is_cow_mapping(vm_flags)) {
450 /*
451 * COW mappings require pages in both parent
452 * and child to be set to read.
453 */
454 make_migration_entry_read(&entry);
455 pte = swp_entry_to_pte(entry);
456 set_pte_at(src_mm, addr, src_pte, pte);
457 }
446 } 458 }
447 goto out_set_pte; 459 goto out_set_pte;
448 } 460 }
@@ -1445,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1445{ 1457{
1446 struct page *old_page, *new_page; 1458 struct page *old_page, *new_page;
1447 pte_t entry; 1459 pte_t entry;
1448 int ret = VM_FAULT_MINOR; 1460 int reuse, ret = VM_FAULT_MINOR;
1449 1461
1450 old_page = vm_normal_page(vma, address, orig_pte); 1462 old_page = vm_normal_page(vma, address, orig_pte);
1451 if (!old_page) 1463 if (!old_page)
1452 goto gotten; 1464 goto gotten;
1453 1465
1454 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { 1466 if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
1455 int reuse = can_share_swap_page(old_page); 1467 (VM_SHARED|VM_WRITE))) {
1456 unlock_page(old_page); 1468 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1457 if (reuse) { 1469 /*
1458 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1470 * Notify the address space that the page is about to
1459 entry = pte_mkyoung(orig_pte); 1471 * become writable so that it can prohibit this or wait
1460 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1472 * for the page to get into an appropriate state.
1461 ptep_set_access_flags(vma, address, page_table, entry, 1); 1473 *
1462 update_mmu_cache(vma, address, entry); 1474 * We do this without the lock held, so that it can
1463 lazy_mmu_prot_update(entry); 1475 * sleep if it needs to.
1464 ret |= VM_FAULT_WRITE; 1476 */
1465 goto unlock; 1477 page_cache_get(old_page);
1478 pte_unmap_unlock(page_table, ptl);
1479
1480 if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
1481 goto unwritable_page;
1482
1483 page_cache_release(old_page);
1484
1485 /*
1486 * Since we dropped the lock we need to revalidate
1487 * the PTE as someone else may have changed it. If
1488 * they did, we just return, as we can count on the
1489 * MMU to tell us if they didn't also make it writable.
1490 */
1491 page_table = pte_offset_map_lock(mm, pmd, address,
1492 &ptl);
1493 if (!pte_same(*page_table, orig_pte))
1494 goto unlock;
1466 } 1495 }
1496
1497 reuse = 1;
1498 } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1499 reuse = can_share_swap_page(old_page);
1500 unlock_page(old_page);
1501 } else {
1502 reuse = 0;
1503 }
1504
1505 if (reuse) {
1506 flush_cache_page(vma, address, pte_pfn(orig_pte));
1507 entry = pte_mkyoung(orig_pte);
1508 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1509 ptep_set_access_flags(vma, address, page_table, entry, 1);
1510 update_mmu_cache(vma, address, entry);
1511 lazy_mmu_prot_update(entry);
1512 ret |= VM_FAULT_WRITE;
1513 goto unlock;
1467 } 1514 }
1468 1515
1469 /* 1516 /*
@@ -1523,6 +1570,10 @@ oom:
1523 if (old_page) 1570 if (old_page)
1524 page_cache_release(old_page); 1571 page_cache_release(old_page);
1525 return VM_FAULT_OOM; 1572 return VM_FAULT_OOM;
1573
1574unwritable_page:
1575 page_cache_release(old_page);
1576 return VM_FAULT_SIGBUS;
1526} 1577}
1527 1578
1528/* 1579/*
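->page_mkwrite() is called here without the page table lock held, so it may sleep, and a negative return turns the write fault into SIGBUS via the unwritable_page path above. A minimal, hypothetical handler following that calling convention (example_page_mkwrite is an invented name; a real filesystem would typically reserve blocks or quota here):

static int example_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	/*
	 * Reserve whatever is needed so later writeback of this shared,
	 * file-backed page cannot fail; refusing (any negative value)
	 * delivers SIGBUS instead of silently allowing the write.
	 */
	if (!PageUptodate(page))
		return -EINVAL;

	return 0;	/* let do_wp_page() make the pte writable */
}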
@@ -1879,7 +1930,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1879 goto out; 1930 goto out;
1880 1931
1881 entry = pte_to_swp_entry(orig_pte); 1932 entry = pte_to_swp_entry(orig_pte);
1882again: 1933 if (is_migration_entry(entry)) {
1934 migration_entry_wait(mm, pmd, address);
1935 goto out;
1936 }
1883 page = lookup_swap_cache(entry); 1937 page = lookup_swap_cache(entry);
1884 if (!page) { 1938 if (!page) {
1885 swapin_readahead(entry, address, vma); 1939 swapin_readahead(entry, address, vma);
@@ -1897,18 +1951,12 @@ again:
1897 1951
1898 /* Had to read the page from swap area: Major fault */ 1952 /* Had to read the page from swap area: Major fault */
1899 ret = VM_FAULT_MAJOR; 1953 ret = VM_FAULT_MAJOR;
1900 inc_page_state(pgmajfault); 1954 count_vm_event(PGMAJFAULT);
1901 grab_swap_token(); 1955 grab_swap_token();
1902 } 1956 }
1903 1957
1904 mark_page_accessed(page); 1958 mark_page_accessed(page);
1905 lock_page(page); 1959 lock_page(page);
1906 if (!PageSwapCache(page)) {
1907 /* Page migration has occured */
1908 unlock_page(page);
1909 page_cache_release(page);
1910 goto again;
1911 }
1912 1960
1913 /* 1961 /*
1914 * Back out if somebody else already faulted in this pte. 1962 * Back out if somebody else already faulted in this pte.
@@ -2074,18 +2122,31 @@ retry:
2074 /* 2122 /*
2075 * Should we do an early C-O-W break? 2123 * Should we do an early C-O-W break?
2076 */ 2124 */
2077 if (write_access && !(vma->vm_flags & VM_SHARED)) { 2125 if (write_access) {
2078 struct page *page; 2126 if (!(vma->vm_flags & VM_SHARED)) {
2127 struct page *page;
2079 2128
2080 if (unlikely(anon_vma_prepare(vma))) 2129 if (unlikely(anon_vma_prepare(vma)))
2081 goto oom; 2130 goto oom;
2082 page = alloc_page_vma(GFP_HIGHUSER, vma, address); 2131 page = alloc_page_vma(GFP_HIGHUSER, vma, address);
2083 if (!page) 2132 if (!page)
2084 goto oom; 2133 goto oom;
2085 copy_user_highpage(page, new_page, address); 2134 copy_user_highpage(page, new_page, address);
2086 page_cache_release(new_page); 2135 page_cache_release(new_page);
2087 new_page = page; 2136 new_page = page;
2088 anon = 1; 2137 anon = 1;
2138
2139 } else {
2140 /* if the page will be shareable, see if the backing
2141 * address space wants to know that the page is about
2142 * to become writable */
2143 if (vma->vm_ops->page_mkwrite &&
2144 vma->vm_ops->page_mkwrite(vma, new_page) < 0
2145 ) {
2146 page_cache_release(new_page);
2147 return VM_FAULT_SIGBUS;
2148 }
2149 }
2089 } 2150 }
2090 2151
2091 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2152 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2263,7 +2324,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2263 2324
2264 __set_current_state(TASK_RUNNING); 2325 __set_current_state(TASK_RUNNING);
2265 2326
2266 inc_page_state(pgfault); 2327 count_vm_event(PGFAULT);
2267 2328
2268 if (unlikely(is_vm_hugetlb_page(vma))) 2329 if (unlikely(is_vm_hugetlb_page(vma)))
2269 return hugetlb_fault(mm, vma, address, write_access); 2330 return hugetlb_fault(mm, vma, address, write_access);
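The inc_page_state()/dec_page_state() calls replaced throughout this file depend on the new counters in mm/vmstat.c (see the diffstat). The event side used for PGFAULT/PGMAJFAULT is assumed to be little more than a per-cpu array increment, roughly:

/* sketch of the assumed interface; the real one lives in linux/vmstat.h */
struct vm_event_state {
	unsigned long event[NR_VM_EVENT_ITEMS];
};

DECLARE_PER_CPU(struct vm_event_state, vm_event_states);

static inline void count_vm_event(enum vm_event_item item)
{
	get_cpu_var(vm_event_states).event[item]++;
	put_cpu();
}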
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 70df5c0d957e..01c9fb97c619 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 4 * Copyright (C)
5 */ 5 */
6 6
7#include <linux/config.h>
8#include <linux/stddef.h> 7#include <linux/stddef.h>
9#include <linux/mm.h> 8#include <linux/mm.h>
10#include <linux/swap.h> 9#include <linux/swap.h>
@@ -21,12 +20,13 @@
21#include <linux/memory_hotplug.h> 20#include <linux/memory_hotplug.h>
22#include <linux/highmem.h> 21#include <linux/highmem.h>
23#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
23#include <linux/ioport.h>
24 24
25#include <asm/tlbflush.h> 25#include <asm/tlbflush.h>
26 26
27extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 27extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
28 unsigned long size); 28 unsigned long size);
29static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) 29static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
30{ 30{
31 struct pglist_data *pgdat = zone->zone_pgdat; 31 struct pglist_data *pgdat = zone->zone_pgdat;
32 int nr_pages = PAGES_PER_SECTION; 32 int nr_pages = PAGES_PER_SECTION;
@@ -34,8 +34,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
34 int zone_type; 34 int zone_type;
35 35
36 zone_type = zone - pgdat->node_zones; 36 zone_type = zone - pgdat->node_zones;
37 if (!populated_zone(zone)) {
38 int ret = 0;
39 ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
40 if (ret < 0)
41 return ret;
42 }
37 memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); 43 memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
38 zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); 44 zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
45 return 0;
39} 46}
40 47
41extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, 48extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
@@ -50,7 +57,11 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
50 if (ret < 0) 57 if (ret < 0)
51 return ret; 58 return ret;
52 59
53 __add_zone(zone, phys_start_pfn); 60 ret = __add_zone(zone, phys_start_pfn);
61
62 if (ret < 0)
63 return ret;
64
54 return register_new_memory(__pfn_to_section(phys_start_pfn)); 65 return register_new_memory(__pfn_to_section(phys_start_pfn));
55} 66}
56 67
@@ -115,7 +126,11 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
115 unsigned long i; 126 unsigned long i;
116 unsigned long flags; 127 unsigned long flags;
117 unsigned long onlined_pages = 0; 128 unsigned long onlined_pages = 0;
129 struct resource res;
130 u64 section_end;
131 unsigned long start_pfn;
118 struct zone *zone; 132 struct zone *zone;
133 int need_zonelists_rebuild = 0;
119 134
120 /* 135 /*
121 * This doesn't need a lock to do pfn_to_page(). 136 * This doesn't need a lock to do pfn_to_page().
@@ -128,15 +143,140 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
128 grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); 143 grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
129 pgdat_resize_unlock(zone->zone_pgdat, &flags); 144 pgdat_resize_unlock(zone->zone_pgdat, &flags);
130 145
131 for (i = 0; i < nr_pages; i++) { 146 /*
132 struct page *page = pfn_to_page(pfn + i); 147 * If this zone is not populated, then it is not in zonelist.
133 online_page(page); 148 * This means the page allocator ignores this zone.
 134 onlined_pages++; 149 * So, the zonelist must be updated after onlining.
150 */
151 if (!populated_zone(zone))
152 need_zonelists_rebuild = 1;
153
154 res.start = (u64)pfn << PAGE_SHIFT;
155 res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1;
156 res.flags = IORESOURCE_MEM; /* we just need system ram */
157 section_end = res.end;
158
159 while (find_next_system_ram(&res) >= 0) {
160 start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
161 nr_pages = (unsigned long)
162 ((res.end + 1 - res.start) >> PAGE_SHIFT);
163
164 if (PageReserved(pfn_to_page(start_pfn))) {
 165 /* this region's pages are not online yet */
166 for (i = 0; i < nr_pages; i++) {
167 struct page *page = pfn_to_page(start_pfn + i);
168 online_page(page);
169 onlined_pages++;
170 }
171 }
172
173 res.start = res.end + 1;
174 res.end = section_end;
135 } 175 }
136 zone->present_pages += onlined_pages; 176 zone->present_pages += onlined_pages;
137 zone->zone_pgdat->node_present_pages += onlined_pages; 177 zone->zone_pgdat->node_present_pages += onlined_pages;
138 178
139 setup_per_zone_pages_min(); 179 setup_per_zone_pages_min();
140 180
181 if (need_zonelists_rebuild)
182 build_all_zonelists();
183 vm_total_pages = nr_free_pagecache_pages();
141 return 0; 184 return 0;
142} 185}
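/*
 * Sketch of assumed behaviour, not from this patch: online_page(), called
 * for each page the resource walk above finds inside a "System RAM" region,
 * essentially hands the still-reserved page to the buddy allocator
 * (generic and per-architecture variants exist elsewhere).
 */
static void example_online_page(struct page *page)
{
	ClearPageReserved(page);	/* hot-added ranges start out reserved */
	init_page_count(page);		/* give the page a refcount of one */
	totalram_pages++;
	__free_page(page);		/* release it to the buddy allocator */
}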
186
187static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
188{
189 struct pglist_data *pgdat;
190 unsigned long zones_size[MAX_NR_ZONES] = {0};
191 unsigned long zholes_size[MAX_NR_ZONES] = {0};
192 unsigned long start_pfn = start >> PAGE_SHIFT;
193
194 pgdat = arch_alloc_nodedata(nid);
195 if (!pgdat)
196 return NULL;
197
198 arch_refresh_nodedata(nid, pgdat);
199
200 /* we can use NODE_DATA(nid) from here */
201
202 /* init node's zones as empty zones, we don't have any present pages.*/
203 free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size);
204
205 return pgdat;
206}
207
208static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
209{
210 arch_refresh_nodedata(nid, NULL);
211 arch_free_nodedata(pgdat);
212 return;
213}
214
215/* add this memory to iomem resource */
216static void register_memory_resource(u64 start, u64 size)
217{
218 struct resource *res;
219
220 res = kzalloc(sizeof(struct resource), GFP_KERNEL);
221 BUG_ON(!res);
222
223 res->name = "System RAM";
224 res->start = start;
225 res->end = start + size - 1;
226 res->flags = IORESOURCE_MEM;
227 if (request_resource(&iomem_resource, res) < 0) {
228 printk("System RAM resource %llx - %llx cannot be added\n",
229 (unsigned long long)res->start, (unsigned long long)res->end);
230 kfree(res);
231 }
232}
233
234
235
236int add_memory(int nid, u64 start, u64 size)
237{
238 pg_data_t *pgdat = NULL;
239 int new_pgdat = 0;
240 int ret;
241
242 if (!node_online(nid)) {
243 pgdat = hotadd_new_pgdat(nid, start);
244 if (!pgdat)
245 return -ENOMEM;
246 new_pgdat = 1;
247 ret = kswapd_run(nid);
248 if (ret)
249 goto error;
250 }
251
252 /* call arch's memory hotadd */
253 ret = arch_add_memory(nid, start, size);
254
255 if (ret < 0)
256 goto error;
257
258 /* we online node here. we can't roll back from here. */
259 node_set_online(nid);
260
261 if (new_pgdat) {
262 ret = register_one_node(nid);
263 /*
 264 * If the sysfs file for the new node can't be created,
 265 * CPUs on that node can't be hot-added. There is no way
 266 * to roll back yet, so catch it with BUG_ON(), reluctantly.
267 */
268 BUG_ON(ret);
269 }
270
271 /* register this memory as resource */
272 register_memory_resource(start, size);
273
274 return ret;
275error:
276 /* rollback pgdat allocation and others */
277 if (new_pgdat)
278 rollback_node_hotadd(nid, pgdat);
279
280 return ret;
281}
282EXPORT_SYMBOL_GPL(add_memory);
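add_memory() is the entry point a hot-add driver is expected to call. A hedged sketch of such a caller (the node lookup is a placeholder; a real ACPI or probe-style driver would derive nid and the range from firmware tables):

static int example_probe_memory(u64 start, u64 size)
{
	int nid = 0;	/* placeholder: map the physical range to a node */
	int ret;

	ret = add_memory(nid, start, size);
	if (ret)
		return ret;

	/*
	 * The new sections are registered but their pages remain offline;
	 * they are put into use later via online_pages(), typically through
	 * the memory sysfs "state" attribute.
	 */
	return 0;
}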
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8778f58880c4..e07e27e846a2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -87,6 +87,8 @@
87#include <linux/seq_file.h> 87#include <linux/seq_file.h>
88#include <linux/proc_fs.h> 88#include <linux/proc_fs.h>
89#include <linux/migrate.h> 89#include <linux/migrate.h>
90#include <linux/rmap.h>
91#include <linux/security.h>
90 92
91#include <asm/tlbflush.h> 93#include <asm/tlbflush.h>
92#include <asm/uaccess.h> 94#include <asm/uaccess.h>
@@ -587,6 +589,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
587 isolate_lru_page(page, pagelist); 589 isolate_lru_page(page, pagelist);
588} 590}
589 591
592static struct page *new_node_page(struct page *page, unsigned long node, int **x)
593{
594 return alloc_pages_node(node, GFP_HIGHUSER, 0);
595}
596
590/* 597/*
591 * Migrate pages from one node to a target node. 598 * Migrate pages from one node to a target node.
592 * Returns error or the number of pages not migrated. 599 * Returns error or the number of pages not migrated.
@@ -603,11 +610,9 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
603 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, 610 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
604 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 611 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
605 612
606 if (!list_empty(&pagelist)) { 613 if (!list_empty(&pagelist))
607 err = migrate_pages_to(&pagelist, NULL, dest); 614 err = migrate_pages(&pagelist, new_node_page, dest);
608 if (!list_empty(&pagelist)) 615
609 putback_lru_pages(&pagelist);
610 }
611 return err; 616 return err;
612} 617}
613 618
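migrate_pages() now takes an allocation callback. The new_page_t type is assumed to be declared in the companion include/linux/migrate.h change as a function receiving the page being replaced, the opaque private argument, and an optional per-page result pointer; new_node_page() above is the simplest implementation. A caller-defined variant could look like:

/* assumed: typedef struct page *new_page_t(struct page *page,
 *				unsigned long private, int **result); */
static struct page *example_new_page(struct page *page, unsigned long private,
				     int **result)
{
	int nid = (int)private;	/* the meaning of 'private' is caller-defined */

	return alloc_pages_node(nid, GFP_HIGHUSER, 0);
}

/* usage, mirroring migrate_to_node() above:
 *	err = migrate_pages(&pagelist, example_new_page, dest);
 */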
@@ -627,6 +632,10 @@ int do_migrate_pages(struct mm_struct *mm,
627 632
628 down_read(&mm->mmap_sem); 633 down_read(&mm->mmap_sem);
629 634
635 err = migrate_vmas(mm, from_nodes, to_nodes, flags);
636 if (err)
637 goto out;
638
630/* 639/*
631 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 640 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
632 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 641 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
@@ -686,7 +695,7 @@ int do_migrate_pages(struct mm_struct *mm,
686 if (err < 0) 695 if (err < 0)
687 break; 696 break;
688 } 697 }
689 698out:
690 up_read(&mm->mmap_sem); 699 up_read(&mm->mmap_sem);
691 if (err < 0) 700 if (err < 0)
692 return err; 701 return err;
@@ -694,6 +703,12 @@ int do_migrate_pages(struct mm_struct *mm,
694 703
695} 704}
696 705
706static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
707{
708 struct vm_area_struct *vma = (struct vm_area_struct *)private;
709
710 return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
711}
697#else 712#else
698 713
699static void migrate_page_add(struct page *page, struct list_head *pagelist, 714static void migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -706,6 +721,11 @@ int do_migrate_pages(struct mm_struct *mm,
706{ 721{
707 return -ENOSYS; 722 return -ENOSYS;
708} 723}
724
725static struct page *new_vma_page(struct page *page, unsigned long private)
726{
727 return NULL;
728}
709#endif 729#endif
710 730
711long do_mbind(unsigned long start, unsigned long len, 731long do_mbind(unsigned long start, unsigned long len,
@@ -767,15 +787,13 @@ long do_mbind(unsigned long start, unsigned long len,
767 err = mbind_range(vma, start, end, new); 787 err = mbind_range(vma, start, end, new);
768 788
769 if (!list_empty(&pagelist)) 789 if (!list_empty(&pagelist))
770 nr_failed = migrate_pages_to(&pagelist, vma, -1); 790 nr_failed = migrate_pages(&pagelist, new_vma_page,
791 (unsigned long)vma);
771 792
772 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 793 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
773 err = -EIO; 794 err = -EIO;
774 } 795 }
775 796
776 if (!list_empty(&pagelist))
777 putback_lru_pages(&pagelist);
778
779 up_write(&mm->mmap_sem); 797 up_write(&mm->mmap_sem);
780 mpol_free(new); 798 mpol_free(new);
781 return err; 799 return err;
@@ -929,6 +947,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
929 goto out; 947 goto out;
930 } 948 }
931 949
950 err = security_task_movememory(task);
951 if (err)
952 goto out;
953
932 err = do_migrate_pages(mm, &old, &new, 954 err = do_migrate_pages(mm, &old, &new,
933 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 955 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
934out: 956out:
@@ -1187,10 +1209,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1187 1209
1188 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); 1210 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1189 page = __alloc_pages(gfp, order, zl); 1211 page = __alloc_pages(gfp, order, zl);
1190 if (page && page_zone(page) == zl->zones[0]) { 1212 if (page && page_zone(page) == zl->zones[0])
1191 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; 1213 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1192 put_cpu();
1193 }
1194 return page; 1214 return page;
1195} 1215}
1196 1216
@@ -1799,7 +1819,7 @@ static inline void check_huge_range(struct vm_area_struct *vma,
1799 1819
1800int show_numa_map(struct seq_file *m, void *v) 1820int show_numa_map(struct seq_file *m, void *v)
1801{ 1821{
1802 struct task_struct *task = m->private; 1822 struct proc_maps_private *priv = m->private;
1803 struct vm_area_struct *vma = v; 1823 struct vm_area_struct *vma = v;
1804 struct numa_maps *md; 1824 struct numa_maps *md;
1805 struct file *file = vma->vm_file; 1825 struct file *file = vma->vm_file;
@@ -1815,7 +1835,7 @@ int show_numa_map(struct seq_file *m, void *v)
1815 return 0; 1835 return 0;
1816 1836
1817 mpol_to_str(buffer, sizeof(buffer), 1837 mpol_to_str(buffer, sizeof(buffer),
1818 get_vma_policy(task, vma, vma->vm_start)); 1838 get_vma_policy(priv->task, vma, vma->vm_start));
1819 1839
1820 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1840 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1821 1841
@@ -1869,7 +1889,7 @@ out:
1869 kfree(md); 1889 kfree(md);
1870 1890
1871 if (m->count < m->size) 1891 if (m->count < m->size)
1872 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 1892 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1873 return 0; 1893 return 0;
1874} 1894}
1875 1895
diff --git a/mm/migrate.c b/mm/migrate.c
index 1c25040693d2..3f1e0c2c942c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -15,6 +15,7 @@
15#include <linux/migrate.h> 15#include <linux/migrate.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/swapops.h>
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
20#include <linux/mm_inline.h> 21#include <linux/mm_inline.h>
@@ -23,13 +24,13 @@
23#include <linux/topology.h> 24#include <linux/topology.h>
24#include <linux/cpu.h> 25#include <linux/cpu.h>
25#include <linux/cpuset.h> 26#include <linux/cpuset.h>
26#include <linux/swapops.h> 27#include <linux/writeback.h>
28#include <linux/mempolicy.h>
29#include <linux/vmalloc.h>
30#include <linux/security.h>
27 31
28#include "internal.h" 32#include "internal.h"
29 33
30/* The maximum number of pages to take off the LRU for migration */
31#define MIGRATE_CHUNK_SIZE 256
32
33#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 34#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
34 35
35/* 36/*
@@ -64,16 +65,11 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist)
64} 65}
65 66
66/* 67/*
67 * migrate_prep() needs to be called after we have compiled the list of pages 68 * migrate_prep() needs to be called before we start compiling a list of pages
68 * to be migrated using isolate_lru_page() but before we begin a series of calls 69 * to be migrated using isolate_lru_page().
69 * to migrate_pages().
70 */ 70 */
71int migrate_prep(void) 71int migrate_prep(void)
72{ 72{
73 /* Must have swap device for migration */
74 if (nr_swap_pages <= 0)
75 return -ENODEV;
76
77 /* 73 /*
78 * Clear the LRU lists so pages can be isolated. 74 * Clear the LRU lists so pages can be isolated.
79 * Note that pages may be moved off the LRU after we have 75 * Note that pages may be moved off the LRU after we have
@@ -87,7 +83,6 @@ int migrate_prep(void)
87 83
88static inline void move_to_lru(struct page *page) 84static inline void move_to_lru(struct page *page)
89{ 85{
90 list_del(&page->lru);
91 if (PageActive(page)) { 86 if (PageActive(page)) {
92 /* 87 /*
93 * lru_cache_add_active checks that 88 * lru_cache_add_active checks that
@@ -113,113 +108,200 @@ int putback_lru_pages(struct list_head *l)
113 int count = 0; 108 int count = 0;
114 109
115 list_for_each_entry_safe(page, page2, l, lru) { 110 list_for_each_entry_safe(page, page2, l, lru) {
111 list_del(&page->lru);
116 move_to_lru(page); 112 move_to_lru(page);
117 count++; 113 count++;
118 } 114 }
119 return count; 115 return count;
120} 116}
121 117
122/* 118static inline int is_swap_pte(pte_t pte)
123 * Non migratable page
124 */
125int fail_migrate_page(struct page *newpage, struct page *page)
126{ 119{
127 return -EIO; 120 return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
128} 121}
129EXPORT_SYMBOL(fail_migrate_page);
130 122
131/* 123/*
132 * swapout a single page 124 * Restore a potential migration pte to a working pte entry
133 * page is locked upon entry, unlocked on exit
134 */ 125 */
135static int swap_page(struct page *page) 126static void remove_migration_pte(struct vm_area_struct *vma,
127 struct page *old, struct page *new)
136{ 128{
137 struct address_space *mapping = page_mapping(page); 129 struct mm_struct *mm = vma->vm_mm;
130 swp_entry_t entry;
131 pgd_t *pgd;
132 pud_t *pud;
133 pmd_t *pmd;
134 pte_t *ptep, pte;
135 spinlock_t *ptl;
136 unsigned long addr = page_address_in_vma(new, vma);
137
138 if (addr == -EFAULT)
139 return;
140
141 pgd = pgd_offset(mm, addr);
142 if (!pgd_present(*pgd))
143 return;
144
145 pud = pud_offset(pgd, addr);
146 if (!pud_present(*pud))
147 return;
148
149 pmd = pmd_offset(pud, addr);
150 if (!pmd_present(*pmd))
151 return;
152
153 ptep = pte_offset_map(pmd, addr);
154
155 if (!is_swap_pte(*ptep)) {
156 pte_unmap(ptep);
157 return;
158 }
138 159
139 if (page_mapped(page) && mapping) 160 ptl = pte_lockptr(mm, pmd);
140 if (try_to_unmap(page, 1) != SWAP_SUCCESS) 161 spin_lock(ptl);
141 goto unlock_retry; 162 pte = *ptep;
163 if (!is_swap_pte(pte))
164 goto out;
142 165
143 if (PageDirty(page)) { 166 entry = pte_to_swp_entry(pte);
144 /* Page is dirty, try to write it out here */
145 switch(pageout(page, mapping)) {
146 case PAGE_KEEP:
147 case PAGE_ACTIVATE:
148 goto unlock_retry;
149 167
150 case PAGE_SUCCESS: 168 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
151 goto retry; 169 goto out;
152 170
153 case PAGE_CLEAN: 171 get_page(new);
154 ; /* try to free the page below */ 172 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
155 } 173 if (is_write_migration_entry(entry))
156 } 174 pte = pte_mkwrite(pte);
175 set_pte_at(mm, addr, ptep, pte);
157 176
158 if (PagePrivate(page)) { 177 if (PageAnon(new))
159 if (!try_to_release_page(page, GFP_KERNEL) || 178 page_add_anon_rmap(new, vma, addr);
160 (!mapping && page_count(page) == 1)) 179 else
161 goto unlock_retry; 180 page_add_file_rmap(new);
162 }
163 181
164 if (remove_mapping(mapping, page)) { 182 /* No need to invalidate - it was non-present before */
165 /* Success */ 183 update_mmu_cache(vma, addr, pte);
166 unlock_page(page); 184 lazy_mmu_prot_update(pte);
167 return 0;
168 }
169 185
170unlock_retry: 186out:
171 unlock_page(page); 187 pte_unmap_unlock(ptep, ptl);
188}
172 189
173retry: 190/*
174 return -EAGAIN; 191 * Note that remove_file_migration_ptes will only work on regular mappings,
192 * Nonlinear mappings do not use migration entries.
193 */
194static void remove_file_migration_ptes(struct page *old, struct page *new)
195{
196 struct vm_area_struct *vma;
197 struct address_space *mapping = page_mapping(new);
198 struct prio_tree_iter iter;
199 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
200
201 if (!mapping)
202 return;
203
204 spin_lock(&mapping->i_mmap_lock);
205
206 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
207 remove_migration_pte(vma, old, new);
208
209 spin_unlock(&mapping->i_mmap_lock);
175} 210}
176 211
177/* 212/*
178 * Remove references for a page and establish the new page with the correct 213 * Must hold mmap_sem lock on at least one of the vmas containing
179 * basic settings to be able to stop accesses to the page. 214 * the page so that the anon_vma cannot vanish.
180 */ 215 */
181int migrate_page_remove_references(struct page *newpage, 216static void remove_anon_migration_ptes(struct page *old, struct page *new)
182 struct page *page, int nr_refs)
183{ 217{
184 struct address_space *mapping = page_mapping(page); 218 struct anon_vma *anon_vma;
185 struct page **radix_pointer; 219 struct vm_area_struct *vma;
220 unsigned long mapping;
186 221
187 /* 222 mapping = (unsigned long)new->mapping;
188 * Avoid doing any of the following work if the page count
189 * indicates that the page is in use or truncate has removed
190 * the page.
191 */
192 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
193 return -EAGAIN;
194 223
195 /* 224 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
196 * Establish swap ptes for anonymous pages or destroy pte 225 return;
197 * maps for files.
198 *
199 * In order to reestablish file backed mappings the fault handlers
200 * will take the radix tree_lock which may then be used to stop
201 * processses from accessing this page until the new page is ready.
202 *
203 * A process accessing via a swap pte (an anonymous page) will take a
204 * page_lock on the old page which will block the process until the
205 * migration attempt is complete. At that time the PageSwapCache bit
206 * will be examined. If the page was migrated then the PageSwapCache
207 * bit will be clear and the operation to retrieve the page will be
208 * retried which will find the new page in the radix tree. Then a new
209 * direct mapping may be generated based on the radix tree contents.
210 *
211 * If the page was not migrated then the PageSwapCache bit
212 * is still set and the operation may continue.
213 */
214 if (try_to_unmap(page, 1) == SWAP_FAIL)
215 /* A vma has VM_LOCKED set -> permanent failure */
216 return -EPERM;
217 226
218 /* 227 /*
219 * Give up if we were unable to remove all mappings. 228 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
220 */ 229 */
221 if (page_mapcount(page)) 230 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
222 return -EAGAIN; 231 spin_lock(&anon_vma->lock);
232
233 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
234 remove_migration_pte(vma, old, new);
235
236 spin_unlock(&anon_vma->lock);
237}
238
239/*
240 * Get rid of all migration entries and replace them by
241 * references to the indicated page.
242 */
243static void remove_migration_ptes(struct page *old, struct page *new)
244{
245 if (PageAnon(new))
246 remove_anon_migration_ptes(old, new);
247 else
248 remove_file_migration_ptes(old, new);
249}
250
251/*
252 * Something used the pte of a page under migration. We need to
253 * get to the page and wait until migration is finished.
254 * When we return from this function the fault will be retried.
255 *
256 * This function is called from do_swap_page().
257 */
258void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
259 unsigned long address)
260{
261 pte_t *ptep, pte;
262 spinlock_t *ptl;
263 swp_entry_t entry;
264 struct page *page;
265
266 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
267 pte = *ptep;
268 if (!is_swap_pte(pte))
269 goto out;
270
271 entry = pte_to_swp_entry(pte);
272 if (!is_migration_entry(entry))
273 goto out;
274
275 page = migration_entry_to_page(entry);
276
277 get_page(page);
278 pte_unmap_unlock(ptep, ptl);
279 wait_on_page_locked(page);
280 put_page(page);
281 return;
282out:
283 pte_unmap_unlock(ptep, ptl);
284}
285
286/*
287 * Replace the page in the mapping.
288 *
289 * The number of remaining references must be:
290 * 1 for anonymous pages without a mapping
291 * 2 for pages with a mapping
292 * 3 for pages with a mapping and PagePrivate set.
293 */
294static int migrate_page_move_mapping(struct address_space *mapping,
295 struct page *newpage, struct page *page)
296{
297 struct page **radix_pointer;
298
299 if (!mapping) {
300 /* Anonymous page */
301 if (page_count(page) != 1)
302 return -EAGAIN;
303 return 0;
304 }
223 305
224 write_lock_irq(&mapping->tree_lock); 306 write_lock_irq(&mapping->tree_lock);
225 307
@@ -227,7 +309,7 @@ int migrate_page_remove_references(struct page *newpage,
227 &mapping->page_tree, 309 &mapping->page_tree,
228 page_index(page)); 310 page_index(page));
229 311
230 if (!page_mapping(page) || page_count(page) != nr_refs || 312 if (page_count(page) != 2 + !!PagePrivate(page) ||
231 *radix_pointer != page) { 313 *radix_pointer != page) {
232 write_unlock_irq(&mapping->tree_lock); 314 write_unlock_irq(&mapping->tree_lock);
233 return -EAGAIN; 315 return -EAGAIN;
@@ -235,19 +317,14 @@ int migrate_page_remove_references(struct page *newpage,
235 317
236 /* 318 /*
237 * Now we know that no one else is looking at the page. 319 * Now we know that no one else is looking at the page.
238 *
239 * Certain minimal information about a page must be available
240 * in order for other subsystems to properly handle the page if they
241 * find it through the radix tree update before we are finished
242 * copying the page.
243 */ 320 */
244 get_page(newpage); 321 get_page(newpage);
245 newpage->index = page->index; 322#ifdef CONFIG_SWAP
246 newpage->mapping = page->mapping;
247 if (PageSwapCache(page)) { 323 if (PageSwapCache(page)) {
248 SetPageSwapCache(newpage); 324 SetPageSwapCache(newpage);
249 set_page_private(newpage, page_private(page)); 325 set_page_private(newpage, page_private(page));
250 } 326 }
327#endif
251 328
252 *radix_pointer = newpage; 329 *radix_pointer = newpage;
253 __put_page(page); 330 __put_page(page);
@@ -255,12 +332,11 @@ int migrate_page_remove_references(struct page *newpage,
255 332
256 return 0; 333 return 0;
257} 334}
258EXPORT_SYMBOL(migrate_page_remove_references);
259 335
260/* 336/*
261 * Copy the page to its new location 337 * Copy the page to its new location
262 */ 338 */
263void migrate_page_copy(struct page *newpage, struct page *page) 339static void migrate_page_copy(struct page *newpage, struct page *page)
264{ 340{
265 copy_highpage(newpage, page); 341 copy_highpage(newpage, page);
266 342
@@ -282,7 +358,9 @@ void migrate_page_copy(struct page *newpage, struct page *page)
282 set_page_dirty(newpage); 358 set_page_dirty(newpage);
283 } 359 }
284 360
361#ifdef CONFIG_SWAP
285 ClearPageSwapCache(page); 362 ClearPageSwapCache(page);
363#endif
286 ClearPageActive(page); 364 ClearPageActive(page);
287 ClearPagePrivate(page); 365 ClearPagePrivate(page);
288 set_page_private(page, 0); 366 set_page_private(page, 0);
@@ -295,7 +373,18 @@ void migrate_page_copy(struct page *newpage, struct page *page)
295 if (PageWriteback(newpage)) 373 if (PageWriteback(newpage))
296 end_page_writeback(newpage); 374 end_page_writeback(newpage);
297} 375}
298EXPORT_SYMBOL(migrate_page_copy); 376
377/************************************************************
378 * Migration functions
379 ***********************************************************/
380
381/* Always fail migration. Used for mappings that are not movable */
382int fail_migrate_page(struct address_space *mapping,
383 struct page *newpage, struct page *page)
384{
385 return -EIO;
386}
387EXPORT_SYMBOL(fail_migrate_page);
299 388
300/* 389/*
301 * Common logic to directly migrate a single page suitable for 390 * Common logic to directly migrate a single page suitable for
@@ -303,51 +392,284 @@ EXPORT_SYMBOL(migrate_page_copy);
303 * 392 *
304 * Pages are locked upon entry and exit. 393 * Pages are locked upon entry and exit.
305 */ 394 */
306int migrate_page(struct page *newpage, struct page *page) 395int migrate_page(struct address_space *mapping,
396 struct page *newpage, struct page *page)
307{ 397{
308 int rc; 398 int rc;
309 399
310 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 400 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
311 401
312 rc = migrate_page_remove_references(newpage, page, 2); 402 rc = migrate_page_move_mapping(mapping, newpage, page);
403
404 if (rc)
405 return rc;
406
407 migrate_page_copy(newpage, page);
408 return 0;
409}
410EXPORT_SYMBOL(migrate_page);
411
412/*
413 * Migration function for pages with buffers. This function can only be used
414 * if the underlying filesystem guarantees that no other references to "page"
415 * exist.
416 */
417int buffer_migrate_page(struct address_space *mapping,
418 struct page *newpage, struct page *page)
419{
420 struct buffer_head *bh, *head;
421 int rc;
422
423 if (!page_has_buffers(page))
424 return migrate_page(mapping, newpage, page);
425
426 head = page_buffers(page);
427
428 rc = migrate_page_move_mapping(mapping, newpage, page);
313 429
314 if (rc) 430 if (rc)
315 return rc; 431 return rc;
316 432
433 bh = head;
434 do {
435 get_bh(bh);
436 lock_buffer(bh);
437 bh = bh->b_this_page;
438
439 } while (bh != head);
440
441 ClearPagePrivate(page);
442 set_page_private(newpage, page_private(page));
443 set_page_private(page, 0);
444 put_page(page);
445 get_page(newpage);
446
447 bh = head;
448 do {
449 set_bh_page(bh, newpage, bh_offset(bh));
450 bh = bh->b_this_page;
451
452 } while (bh != head);
453
454 SetPagePrivate(newpage);
455
317 migrate_page_copy(newpage, page); 456 migrate_page_copy(newpage, page);
318 457
458 bh = head;
459 do {
460 unlock_buffer(bh);
461 put_bh(bh);
462 bh = bh->b_this_page;
463
464 } while (bh != head);
465
466 return 0;
467}
468EXPORT_SYMBOL(buffer_migrate_page);
469
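/*
 * Sketch of assumed usage, not taken from this patch: a filesystem opts in
 * to the helpers above through its address_space_operations.  The name
 * example_aops is invented; only the .migratepage member, migrate_page()
 * and buffer_migrate_page() come from the code above.
 */
static struct address_space_operations example_aops = {
	.migratepage	= buffer_migrate_page,	/* pages carry buffer_heads */
};
/* mappings without buffers can simply use .migratepage = migrate_page */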
470/*
471 * Writeback a page to clean the dirty state
472 */
473static int writeout(struct address_space *mapping, struct page *page)
474{
475 struct writeback_control wbc = {
476 .sync_mode = WB_SYNC_NONE,
477 .nr_to_write = 1,
478 .range_start = 0,
479 .range_end = LLONG_MAX,
480 .nonblocking = 1,
481 .for_reclaim = 1
482 };
483 int rc;
484
485 if (!mapping->a_ops->writepage)
486 /* No write method for the address space */
487 return -EINVAL;
488
489 if (!clear_page_dirty_for_io(page))
490 /* Someone else already triggered a write */
491 return -EAGAIN;
492
319 /* 493 /*
320 * Remove auxiliary swap entries and replace 494 * A dirty page may imply that the underlying filesystem has
321 * them with real ptes. 495 * the page on some queue. So the page must be clean for
 322 * 496 * migration. Writeout may mean we lose the lock and the
323 * Note that a real pte entry will allow processes that are not 497 * page state is no longer what we checked for earlier.
324 * waiting on the page lock to use the new page via the page tables 498 * At this point we know that the migration attempt cannot
325 * before the new page is unlocked. 499 * be successful.
326 */ 500 */
327 remove_from_swap(newpage); 501 remove_migration_ptes(page, page);
328 return 0; 502
503 rc = mapping->a_ops->writepage(page, &wbc);
504 if (rc < 0)
505 /* I/O Error writing */
506 return -EIO;
507
508 if (rc != AOP_WRITEPAGE_ACTIVATE)
509 /* unlocked. Relock */
510 lock_page(page);
511
512 return -EAGAIN;
513}
514
515/*
516 * Default handling if a filesystem does not provide a migration function.
517 */
518static int fallback_migrate_page(struct address_space *mapping,
519 struct page *newpage, struct page *page)
520{
521 if (PageDirty(page))
522 return writeout(mapping, page);
523
524 /*
525 * Buffers may be managed in a filesystem specific way.
526 * We must have no buffers or drop them.
527 */
528 if (page_has_buffers(page) &&
529 !try_to_release_page(page, GFP_KERNEL))
530 return -EAGAIN;
531
532 return migrate_page(mapping, newpage, page);
533}
534
535/*
536 * Move a page to a newly allocated page
537 * The page is locked and all ptes have been successfully removed.
538 *
539 * The new page will have replaced the old page if this function
540 * is successful.
541 */
542static int move_to_new_page(struct page *newpage, struct page *page)
543{
544 struct address_space *mapping;
545 int rc;
546
547 /*
548 * Block others from accessing the page when we get around to
549 * establishing additional references. We are the only one
550 * holding a reference to the new page at this point.
551 */
552 if (TestSetPageLocked(newpage))
553 BUG();
554
555 /* Prepare mapping for the new page.*/
556 newpage->index = page->index;
557 newpage->mapping = page->mapping;
558
559 mapping = page_mapping(page);
560 if (!mapping)
561 rc = migrate_page(mapping, newpage, page);
562 else if (mapping->a_ops->migratepage)
563 /*
564 * Most pages have a mapping and most filesystems
565 * should provide a migration function. Anonymous
566 * pages are part of swap space which also has its
567 * own migration function. This is the most common
568 * path for page migration.
569 */
570 rc = mapping->a_ops->migratepage(mapping,
571 newpage, page);
572 else
573 rc = fallback_migrate_page(mapping, newpage, page);
574
575 if (!rc)
576 remove_migration_ptes(page, newpage);
577 else
578 newpage->mapping = NULL;
579
580 unlock_page(newpage);
581
582 return rc;
583}
584
585/*
586 * Obtain the lock on page, remove all ptes and migrate the page
587 * to the newly allocated page in newpage.
588 */
589static int unmap_and_move(new_page_t get_new_page, unsigned long private,
590 struct page *page, int force)
591{
592 int rc = 0;
593 int *result = NULL;
594 struct page *newpage = get_new_page(page, private, &result);
595
596 if (!newpage)
597 return -ENOMEM;
598
599 if (page_count(page) == 1)
600 /* page was freed from under us. So we are done. */
601 goto move_newpage;
602
603 rc = -EAGAIN;
604 if (TestSetPageLocked(page)) {
605 if (!force)
606 goto move_newpage;
607 lock_page(page);
608 }
609
610 if (PageWriteback(page)) {
611 if (!force)
612 goto unlock;
613 wait_on_page_writeback(page);
614 }
615
616 /*
617 * Establish migration ptes or remove ptes
618 */
619 try_to_unmap(page, 1);
620 if (!page_mapped(page))
621 rc = move_to_new_page(newpage, page);
622
623 if (rc)
624 remove_migration_ptes(page, page);
625
626unlock:
627 unlock_page(page);
628
629 if (rc != -EAGAIN) {
630 /*
631 * A page that has been migrated has all references
632 * removed and will be freed. A page that has not been
633 * migrated will have kepts its references and be
634 * restored.
635 */
636 list_del(&page->lru);
637 move_to_lru(page);
638 }
639
640move_newpage:
641 /*
642 * Move the new page to the LRU. If migration was not successful
643 * then this will free the page.
644 */
645 move_to_lru(newpage);
646 if (result) {
647 if (rc)
648 *result = rc;
649 else
650 *result = page_to_nid(newpage);
651 }
652 return rc;
329} 653}
330EXPORT_SYMBOL(migrate_page);
331 654
332/* 655/*
333 * migrate_pages 656 * migrate_pages
334 * 657 *
 335 * Two lists are passed to this function. The first list 658 * The function takes one list of pages to migrate and a callback
 336 * contains the pages isolated from the LRU to be migrated. 659 * that, from the page to be migrated and the private data, determines
 337 * The second list contains new pages that the pages isolated 660 * the target of the move and allocates the new page.
338 * can be moved to. If the second list is NULL then all
339 * pages are swapped out.
340 * 661 *
341 * The function returns after 10 attempts or if no pages 662 * The function returns after 10 attempts or if no pages
 342 * are movable anymore because to has become empty 663 * are movable anymore because the list has become empty
343 * or no retryable pages exist anymore. 664 * or no retryable pages exist anymore. All pages will be
 665 * returned to the LRU or freed.
344 * 666 *
345 * Return: Number of pages not migrated when "to" ran empty. 667 * Return: Number of pages not migrated or error code.
346 */ 668 */
347int migrate_pages(struct list_head *from, struct list_head *to, 669int migrate_pages(struct list_head *from,
348 struct list_head *moved, struct list_head *failed) 670 new_page_t get_new_page, unsigned long private)
349{ 671{
350 int retry; 672 int retry = 1;
351 int nr_failed = 0; 673 int nr_failed = 0;
352 int pass = 0; 674 int pass = 0;
353 struct page *page; 675 struct page *page;
@@ -358,305 +680,317 @@ int migrate_pages(struct list_head *from, struct list_head *to,
358 if (!swapwrite) 680 if (!swapwrite)
359 current->flags |= PF_SWAPWRITE; 681 current->flags |= PF_SWAPWRITE;
360 682
361redo: 683 for(pass = 0; pass < 10 && retry; pass++) {
362 retry = 0; 684 retry = 0;
685
686 list_for_each_entry_safe(page, page2, from, lru) {
687 cond_resched();
688
689 rc = unmap_and_move(get_new_page, private,
690 page, pass > 2);
691
692 switch(rc) {
693 case -ENOMEM:
694 goto out;
695 case -EAGAIN:
696 retry++;
697 break;
698 case 0:
699 break;
700 default:
701 /* Permanent failure */
702 nr_failed++;
703 break;
704 }
705 }
706 }
707 rc = 0;
708out:
709 if (!swapwrite)
710 current->flags &= ~PF_SWAPWRITE;
711
712 putback_lru_pages(from);
713
714 if (rc)
715 return rc;
363 716
364 list_for_each_entry_safe(page, page2, from, lru) { 717 return nr_failed + retry;
365 struct page *newpage = NULL; 718}
366 struct address_space *mapping;
367 719
368 cond_resched(); 720#ifdef CONFIG_NUMA
721/*
722 * Move a list of individual pages
723 */
724struct page_to_node {
725 unsigned long addr;
726 struct page *page;
727 int node;
728 int status;
729};
369 730
370 rc = 0; 731static struct page *new_page_node(struct page *p, unsigned long private,
371 if (page_count(page) == 1) 732 int **result)
372 /* page was freed from under us. So we are done. */ 733{
373 goto next; 734 struct page_to_node *pm = (struct page_to_node *)private;
374 735
375 if (to && list_empty(to)) 736 while (pm->node != MAX_NUMNODES && pm->page != p)
376 break; 737 pm++;
377 738
378 /* 739 if (pm->node == MAX_NUMNODES)
379 * Skip locked pages during the first two passes to give the 740 return NULL;
380 * functions holding the lock time to release the page. Later we
381 * use lock_page() to have a higher chance of acquiring the
382 * lock.
383 */
384 rc = -EAGAIN;
385 if (pass > 2)
386 lock_page(page);
387 else
388 if (TestSetPageLocked(page))
389 goto next;
390 741
391 /* 742 *result = &pm->status;
392 * Only wait on writeback if we have already done a pass where
393 * we we may have triggered writeouts for lots of pages.
394 */
395 if (pass > 0) {
396 wait_on_page_writeback(page);
397 } else {
398 if (PageWriteback(page))
399 goto unlock_page;
400 }
401 743
402 /* 744 return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
403 * Anonymous pages must have swap cache references otherwise 745}
404 * the information contained in the page maps cannot be
405 * preserved.
406 */
407 if (PageAnon(page) && !PageSwapCache(page)) {
408 if (!add_to_swap(page, GFP_KERNEL)) {
409 rc = -ENOMEM;
410 goto unlock_page;
411 }
412 }
413 746
414 if (!to) { 747/*
415 rc = swap_page(page); 748 * Move a set of pages as indicated in the pm array. The addr
416 goto next; 749 * field must be set to the virtual address of the page to be moved
417 } 750 * and the node number must contain a valid target node.
751 */
752static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
753 int migrate_all)
754{
755 int err;
756 struct page_to_node *pp;
757 LIST_HEAD(pagelist);
758
759 down_read(&mm->mmap_sem);
418 760
419 newpage = lru_to_page(to); 761 /*
420 lock_page(newpage); 762 * Build a list of pages to migrate
763 */
764 migrate_prep();
765 for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
766 struct vm_area_struct *vma;
767 struct page *page;
421 768
422 /* 769 /*
423 * Pages are properly locked and writeback is complete. 770 * A valid page pointer that will not match any of the
424 * Try to migrate the page. 771 * pages that will be moved.
425 */ 772 */
426 mapping = page_mapping(page); 773 pp->page = ZERO_PAGE(0);
427 if (!mapping)
428 goto unlock_both;
429 774
430 if (mapping->a_ops->migratepage) { 775 err = -EFAULT;
431 /* 776 vma = find_vma(mm, pp->addr);
432 * Most pages have a mapping and most filesystems 777 if (!vma)
433 * should provide a migration function. Anonymous 778 goto set_status;
434 * pages are part of swap space which also has its
435 * own migration function. This is the most common
436 * path for page migration.
437 */
438 rc = mapping->a_ops->migratepage(newpage, page);
439 goto unlock_both;
440 }
441
442 /* Make sure the dirty bit is up to date */
443 if (try_to_unmap(page, 1) == SWAP_FAIL) {
444 rc = -EPERM;
445 goto unlock_both;
446 }
447 779
448 if (page_mapcount(page)) { 780 page = follow_page(vma, pp->addr, FOLL_GET);
449 rc = -EAGAIN; 781 err = -ENOENT;
450 goto unlock_both; 782 if (!page)
451 } 783 goto set_status;
452 784
453 /* 785 if (PageReserved(page)) /* Check for zero page */
454 * Default handling if a filesystem does not provide 786 goto put_and_set;
455 * a migration function. We can only migrate clean
456 * pages so try to write out any dirty pages first.
457 */
458 if (PageDirty(page)) {
459 switch (pageout(page, mapping)) {
460 case PAGE_KEEP:
461 case PAGE_ACTIVATE:
462 goto unlock_both;
463
464 case PAGE_SUCCESS:
465 unlock_page(newpage);
466 goto next;
467
468 case PAGE_CLEAN:
469 ; /* try to migrate the page below */
470 }
471 }
472 787
473 /* 788 pp->page = page;
474 * Buffers are managed in a filesystem specific way. 789 err = page_to_nid(page);
475 * We must have no buffers or drop them.
476 */
477 if (!page_has_buffers(page) ||
478 try_to_release_page(page, GFP_KERNEL)) {
479 rc = migrate_page(newpage, page);
480 goto unlock_both;
481 }
482 790
483 /* 791 if (err == pp->node)
484 * On early passes with mapped pages simply
485 * retry. There may be a lock held for some
486 * buffers that may go away. Later
487 * swap them out.
488 */
489 if (pass > 4) {
490 /* 792 /*
491 * Persistently unable to drop buffers..... As a 793 * Node already in the right place
492 * measure of last resort we fall back to
493 * swap_page().
494 */ 794 */
495 unlock_page(newpage); 795 goto put_and_set;
496 newpage = NULL;
497 rc = swap_page(page);
498 goto next;
499 }
500 796
501unlock_both: 797 err = -EACCES;
502 unlock_page(newpage); 798 if (page_mapcount(page) > 1 &&
503 799 !migrate_all)
504unlock_page: 800 goto put_and_set;
505 unlock_page(page); 801
506 802 err = isolate_lru_page(page, &pagelist);
507next: 803put_and_set:
508 if (rc == -EAGAIN) { 804 /*
509 retry++; 805 * Either remove the duplicate refcount from
510 } else if (rc) { 806 * isolate_lru_page() or drop the page ref if it was
511 /* Permanent failure */ 807 * not isolated.
512 list_move(&page->lru, failed); 808 */
513 nr_failed++; 809 put_page(page);
514 } else { 810set_status:
515 if (newpage) { 811 pp->status = err;
516 /* Successful migration. Return page to LRU */
517 move_to_lru(newpage);
518 }
519 list_move(&page->lru, moved);
520 }
521 } 812 }
522 if (retry && pass++ < 10)
523 goto redo;
524 813
525 if (!swapwrite) 814 if (!list_empty(&pagelist))
526 current->flags &= ~PF_SWAPWRITE; 815 err = migrate_pages(&pagelist, new_page_node,
816 (unsigned long)pm);
817 else
818 err = -ENOENT;
527 819
528 return nr_failed + retry; 820 up_read(&mm->mmap_sem);
821 return err;
529} 822}
530 823
531/* 824/*
532 * Migration function for pages with buffers. This function can only be used 825 * Determine the nodes of a list of pages. The addr in the pm array
533 * if the underlying filesystem guarantees that no other references to "page" 826 * must have been set to the virtual address of which we want to determine
534 * exist. 827 * the node number.
535 */ 828 */
536int buffer_migrate_page(struct page *newpage, struct page *page) 829static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
537{ 830{
538 struct address_space *mapping = page->mapping; 831 down_read(&mm->mmap_sem);
539 struct buffer_head *bh, *head; 832
540 int rc; 833 for ( ; pm->node != MAX_NUMNODES; pm++) {
834 struct vm_area_struct *vma;
835 struct page *page;
836 int err;
837
838 err = -EFAULT;
839 vma = find_vma(mm, pm->addr);
840 if (!vma)
841 goto set_status;
842
843 page = follow_page(vma, pm->addr, 0);
844 err = -ENOENT;
845 /* Use PageReserved to check for zero page */
846 if (!page || PageReserved(page))
847 goto set_status;
848
849 err = page_to_nid(page);
850set_status:
851 pm->status = err;
852 }
541 853
542 if (!mapping) 854 up_read(&mm->mmap_sem);
543 return -EAGAIN; 855 return 0;
856}
544 857
545 if (!page_has_buffers(page)) 858/*
546 return migrate_page(newpage, page); 859 * Move a list of pages in the address space of the currently executing
860 * process.
861 */
862asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
863 const void __user * __user *pages,
864 const int __user *nodes,
865 int __user *status, int flags)
866{
867 int err = 0;
868 int i;
869 struct task_struct *task;
870 nodemask_t task_nodes;
871 struct mm_struct *mm;
872 struct page_to_node *pm = NULL;
547 873
548 head = page_buffers(page); 874 /* Check flags */
875 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
876 return -EINVAL;
549 877
550 rc = migrate_page_remove_references(newpage, page, 3); 878 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
879 return -EPERM;
551 880
552 if (rc) 881 /* Find the mm_struct */
553 return rc; 882 read_lock(&tasklist_lock);
883 task = pid ? find_task_by_pid(pid) : current;
884 if (!task) {
885 read_unlock(&tasklist_lock);
886 return -ESRCH;
887 }
888 mm = get_task_mm(task);
889 read_unlock(&tasklist_lock);
554 890
555 bh = head; 891 if (!mm)
556 do { 892 return -EINVAL;
557 get_bh(bh);
558 lock_buffer(bh);
559 bh = bh->b_this_page;
560 893
561 } while (bh != head); 894 /*
895 * Check if this process has the right to modify the specified
896 * process. The right exists if the process has administrative
897 * capabilities, superuser privileges or the same
898 * userid as the target process.
899 */
900 if ((current->euid != task->suid) && (current->euid != task->uid) &&
901 (current->uid != task->suid) && (current->uid != task->uid) &&
902 !capable(CAP_SYS_NICE)) {
903 err = -EPERM;
904 goto out2;
905 }
562 906
563 ClearPagePrivate(page); 907 err = security_task_movememory(task);
564 set_page_private(newpage, page_private(page)); 908 if (err)
565 set_page_private(page, 0); 909 goto out2;
566 put_page(page);
567 get_page(newpage);
568 910
569 bh = head;
570 do {
571 set_bh_page(bh, newpage, bh_offset(bh));
572 bh = bh->b_this_page;
573 911
574 } while (bh != head); 912 task_nodes = cpuset_mems_allowed(task);
575 913
576 SetPagePrivate(newpage); 914 /* Limit nr_pages so that the multiplication may not overflow */
915 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
916 err = -E2BIG;
917 goto out2;
918 }
577 919
578 migrate_page_copy(newpage, page); 920 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
921 if (!pm) {
922 err = -ENOMEM;
923 goto out2;
924 }
579 925
580 bh = head; 926 /*
581 do { 927 * Get parameters from user space and initialize the pm
582 unlock_buffer(bh); 928 * array. Return various errors if the user did something wrong.
583 put_bh(bh); 929 */
584 bh = bh->b_this_page; 930 for (i = 0; i < nr_pages; i++) {
931 const void *p;
585 932
586 } while (bh != head); 933 err = -EFAULT;
934 if (get_user(p, pages + i))
935 goto out;
587 936
588 return 0; 937 pm[i].addr = (unsigned long)p;
589} 938 if (nodes) {
590EXPORT_SYMBOL(buffer_migrate_page); 939 int node;
591 940
592/* 941 if (get_user(node, nodes + i))
593 * Migrate the list 'pagelist' of pages to a certain destination. 942 goto out;
594 *
595 * Specify destination with either non-NULL vma or dest_node >= 0
596 * Return the number of pages not migrated or error code
597 */
598int migrate_pages_to(struct list_head *pagelist,
599 struct vm_area_struct *vma, int dest)
600{
601 LIST_HEAD(newlist);
602 LIST_HEAD(moved);
603 LIST_HEAD(failed);
604 int err = 0;
605 unsigned long offset = 0;
606 int nr_pages;
607 struct page *page;
608 struct list_head *p;
609 943
610redo: 944 err = -ENODEV;
611 nr_pages = 0; 945 if (!node_online(node))
612 list_for_each(p, pagelist) { 946 goto out;
613 if (vma) {
614 /*
615 * The address passed to alloc_page_vma is used to
616 * generate the proper interleave behavior. We fake
617 * the address here by an increasing offset in order
618 * to get the proper distribution of pages.
619 *
620 * No decision has been made as to which page
621 * a certain old page is moved to so we cannot
622 * specify the correct address.
623 */
624 page = alloc_page_vma(GFP_HIGHUSER, vma,
625 offset + vma->vm_start);
626 offset += PAGE_SIZE;
627 }
628 else
629 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
630 947
631 if (!page) { 948 err = -EACCES;
632 err = -ENOMEM; 949 if (!node_isset(node, task_nodes))
633 goto out; 950 goto out;
951
952 pm[i].node = node;
634 } 953 }
635 list_add_tail(&page->lru, &newlist);
636 nr_pages++;
637 if (nr_pages > MIGRATE_CHUNK_SIZE)
638 break;
639 } 954 }
640 err = migrate_pages(pagelist, &newlist, &moved, &failed); 955 /* End marker */
956 pm[nr_pages].node = MAX_NUMNODES;
957
958 if (nodes)
959 err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
960 else
961 err = do_pages_stat(mm, pm);
641 962
642 putback_lru_pages(&moved); /* Call release pages instead ?? */ 963 if (err >= 0)
964 /* Return status information */
965 for (i = 0; i < nr_pages; i++)
966 if (put_user(pm[i].status, status + i))
967 err = -EFAULT;
643 968
644 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
645 goto redo;
646out: 969out:
647 /* Return leftover allocated pages */ 970 vfree(pm);
648 while (!list_empty(&newlist)) { 971out2:
649 page = list_entry(newlist.next, struct page, lru); 972 mmput(mm);
650 list_del(&page->lru); 973 return err;
651 __free_page(page); 974}
652 } 975#endif
653 list_splice(&failed, pagelist); 976
654 if (err < 0) 977/*
655 return err; 978 * Call migration functions in the vma_ops that may prepare
656 979 * memory in a vm for migration. migration functions may perform
657 /* Calculate number of leftover pages */ 980 * the migration for vmas that do not have an underlying page struct.
658 nr_pages = 0; 981 */
659 list_for_each(p, pagelist) 982int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
660 nr_pages++; 983 const nodemask_t *from, unsigned long flags)
661 return nr_pages; 984{
985 struct vm_area_struct *vma;
986 int err = 0;
987
988 for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
989 if (vma->vm_ops && vma->vm_ops->migrate) {
990 err = vma->vm_ops->migrate(vma, to, from, flags);
991 if (err)
992 break;
993 }
994 }
995 return err;
662} 996}
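sys_move_pages() above is the user-visible entry point for per-page migration. A hedged user-space sketch of invoking it directly, assuming the architecture has wired up __NR_move_pages and taking the MPOL_MF_MOVE value from linux/mempolicy.h (a libnuma wrapper would normally hide this):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define MPOL_MF_MOVE	(1 << 1)	/* assumed value from linux/mempolicy.h */

static long example_move_one_page(void *addr, int target_node)
{
	void *pages[1] = { addr };
	int nodes[1] = { target_node };
	int status[1] = { -1 };
	long rc;

	/* pid 0 selects the calling process, as in the kernel code above */
	rc = syscall(__NR_move_pages, 0, 1UL, pages, nodes, status,
		     MPOL_MF_MOVE);
	if (rc == 0)
		printf("page now on node %d\n", status[0]);
	return rc;
}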
diff --git a/mm/mmap.c b/mm/mmap.c
index e6ee12344b13..c1868ecdbc5f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -96,7 +96,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
96 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 96 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
97 unsigned long n; 97 unsigned long n;
98 98
99 free = get_page_cache_size(); 99 free = global_page_state(NR_FILE_PAGES);
100 free += nr_swap_pages; 100 free += nr_swap_pages;
101 101
102 /* 102 /*
@@ -1065,7 +1065,8 @@ munmap_back:
1065 vma->vm_start = addr; 1065 vma->vm_start = addr;
1066 vma->vm_end = addr + len; 1066 vma->vm_end = addr + len;
1067 vma->vm_flags = vm_flags; 1067 vma->vm_flags = vm_flags;
1068 vma->vm_page_prot = protection_map[vm_flags & 0x0f]; 1068 vma->vm_page_prot = protection_map[vm_flags &
1069 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
1069 vma->vm_pgoff = pgoff; 1070 vma->vm_pgoff = pgoff;
1070 1071
1071 if (file) { 1072 if (file) {
@@ -1089,6 +1090,12 @@ munmap_back:
1089 goto free_vma; 1090 goto free_vma;
1090 } 1091 }
1091 1092
1093 /* Don't make the VMA automatically writable if it's shared, but the
1094 * backer wishes to know when pages are first written to */
1095 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1096 vma->vm_page_prot =
1097 protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
1098
1092 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform 1099 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1093 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) 1100 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1094 * that memory reservation must be checked; but that reservation 1101 * that memory reservation must be checked; but that reservation
@@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1921 vma->vm_end = addr + len; 1928 vma->vm_end = addr + len;
1922 vma->vm_pgoff = pgoff; 1929 vma->vm_pgoff = pgoff;
1923 vma->vm_flags = flags; 1930 vma->vm_flags = flags;
1924 vma->vm_page_prot = protection_map[flags & 0x0f]; 1931 vma->vm_page_prot = protection_map[flags &
1932 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
1925 vma_link(mm, vma, prev, rb_link, rb_parent); 1933 vma_link(mm, vma, prev, rb_link, rb_parent);
1926out: 1934out:
1927 mm->total_vm += len >> PAGE_SHIFT; 1935 mm->total_vm += len >> PAGE_SHIFT;
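
Both mmap.c hunks above replace the bare 0x0f mask with the named VM_READ|VM_WRITE|VM_EXEC|VM_SHARED bits when indexing protection_map[], and the new page_mkwrite hunk re-indexes without VM_SHARED, so a shared mapping whose backer wants write notifications starts out write-protected and the first store faults into the driver. A rough userspace illustration of the indexing; the table contents here are made up (shown /proc/<pid>/maps style), only the masking is the point:

/* Hypothetical 16-entry protection table indexed by the low vm_flags bits. */
#include <stdio.h>

#define VM_READ   0x1UL
#define VM_WRITE  0x2UL
#define VM_EXEC   0x4UL
#define VM_SHARED 0x8UL

static const char *prot_map[16] = {
        "---p", "r--p", "-w-p", "rw-p", "--xp", "r-xp", "-wxp", "rwxp",
        "---s", "r--s", "-w-s", "rw-s", "--xs", "r-xs", "-wxs", "rwxs",
};

int main(void)
{
        unsigned long flags = VM_READ | VM_WRITE | VM_SHARED;

        printf("plain shared mapping    : %s\n",
               prot_map[flags & (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]);
        printf("with a page_mkwrite hook: %s (first write faults)\n",
               prot_map[flags & (VM_READ | VM_WRITE | VM_EXEC)]);
        return 0;
}
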
diff --git a/mm/mmzone.c b/mm/mmzone.c
index b022370e612e..0959ee1a4795 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -5,7 +5,6 @@
5 */ 5 */
6 6
7 7
8#include <linux/config.h>
9#include <linux/stddef.h> 8#include <linux/stddef.h>
10#include <linux/mmzone.h> 9#include <linux/mmzone.h>
11#include <linux/module.h> 10#include <linux/module.h>
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c14d4289b61..638edabaff71 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -19,7 +19,8 @@
19#include <linux/mempolicy.h> 19#include <linux/mempolicy.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22 22#include <linux/swap.h>
23#include <linux/swapops.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24#include <asm/pgtable.h> 25#include <asm/pgtable.h>
25#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
@@ -28,12 +29,13 @@
28static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 29static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
29 unsigned long addr, unsigned long end, pgprot_t newprot) 30 unsigned long addr, unsigned long end, pgprot_t newprot)
30{ 31{
31 pte_t *pte; 32 pte_t *pte, oldpte;
32 spinlock_t *ptl; 33 spinlock_t *ptl;
33 34
34 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 35 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
35 do { 36 do {
36 if (pte_present(*pte)) { 37 oldpte = *pte;
38 if (pte_present(oldpte)) {
37 pte_t ptent; 39 pte_t ptent;
38 40
39 /* Avoid an SMP race with hardware updated dirty/clean 41 /* Avoid an SMP race with hardware updated dirty/clean
@@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
43 ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); 45 ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
44 set_pte_at(mm, addr, pte, ptent); 46 set_pte_at(mm, addr, pte, ptent);
45 lazy_mmu_prot_update(ptent); 47 lazy_mmu_prot_update(ptent);
48#ifdef CONFIG_MIGRATION
49 } else if (!pte_file(oldpte)) {
50 swp_entry_t entry = pte_to_swp_entry(oldpte);
51
52 if (is_write_migration_entry(entry)) {
53 /*
54 * A protection check is difficult so
55 * just be safe and disable write
56 */
57 make_migration_entry_read(&entry);
58 set_pte_at(mm, addr, pte,
59 swp_entry_to_pte(entry));
60 }
61#endif
46 } 62 }
63
47 } while (pte++, addr += PAGE_SIZE, addr != end); 64 } while (pte++, addr += PAGE_SIZE, addr != end);
48 pte_unmap_unlock(pte - 1, ptl); 65 pte_unmap_unlock(pte - 1, ptl);
49} 66}
@@ -106,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
106 unsigned long oldflags = vma->vm_flags; 123 unsigned long oldflags = vma->vm_flags;
107 long nrpages = (end - start) >> PAGE_SHIFT; 124 long nrpages = (end - start) >> PAGE_SHIFT;
108 unsigned long charged = 0; 125 unsigned long charged = 0;
126 unsigned int mask;
109 pgprot_t newprot; 127 pgprot_t newprot;
110 pgoff_t pgoff; 128 pgoff_t pgoff;
111 int error; 129 int error;
@@ -132,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
132 } 150 }
133 } 151 }
134 152
135 newprot = protection_map[newflags & 0xf];
136
137 /* 153 /*
138 * First try to merge with previous and/or next vma. 154 * First try to merge with previous and/or next vma.
139 */ 155 */
@@ -160,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
160 } 176 }
161 177
162success: 178success:
179 /* Don't make the VMA automatically writable if it's shared, but the
180 * backer wishes to know when pages are first written to */
181 mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
182 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
183 mask &= ~VM_SHARED;
184
185 newprot = protection_map[newflags & mask];
186
163 /* 187 /*
164 * vm_flags and vm_page_prot are protected by the mmap_sem 188 * vm_flags and vm_page_prot are protected by the mmap_sem
165 * held in write mode. 189 * held in write mode.
@@ -205,8 +229,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
205 /* 229 /*
206 * Does the application expect PROT_READ to imply PROT_EXEC: 230 * Does the application expect PROT_READ to imply PROT_EXEC:
207 */ 231 */
208 if (unlikely((prot & PROT_READ) && 232 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
209 (current->personality & READ_IMPLIES_EXEC)))
210 prot |= PROT_EXEC; 233 prot |= PROT_EXEC;
211 234
212 vm_flags = calc_vm_prot_bits(prot); 235 vm_flags = calc_vm_prot_bits(prot);
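
The new CONFIG_MIGRATION branch in change_pte_range() above handles PTEs that currently hold a migration entry instead of a present page: a write migration entry is downgraded to a read one, so the protection change cannot be bypassed once migration completes and the entry is turned back into a real PTE. A toy model of that downgrade, using an invented bit layout rather than the kernel's swp_entry_t encoding:

/* Invented encoding: entry type in the high bits, offset in the low bits.
 * Not the kernel's layout; only the downgrade logic is being illustrated. */
#include <stdio.h>

#define TYPE_SHIFT       28
#define OFFSET_MASK      ((1UL << TYPE_SHIFT) - 1)
#define MIGRATION_READ   1UL
#define MIGRATION_WRITE  2UL

static unsigned long make_entry(unsigned long type, unsigned long offset)
{
        return (type << TYPE_SHIFT) | (offset & OFFSET_MASK);
}

static int is_write_migration_entry(unsigned long e)
{
        return (e >> TYPE_SHIFT) == MIGRATION_WRITE;
}

static unsigned long make_migration_entry_read(unsigned long e)
{
        return make_entry(MIGRATION_READ, e & OFFSET_MASK);
}

int main(void)
{
        unsigned long entry = make_entry(MIGRATION_WRITE, 0x1234);

        if (is_write_migration_entry(entry))
                entry = make_migration_entry_read(entry);
        printf("type=%lu offset=%#lx\n",
               entry >> TYPE_SHIFT, entry & OFFSET_MASK);
        return 0;
}
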
diff --git a/mm/msync.c b/mm/msync.c
index bc6c95376366..d083544df21b 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
170 * just ignore them, but return -ENOMEM at the end. 170 * just ignore them, but return -ENOMEM at the end.
171 */ 171 */
172 down_read(&current->mm->mmap_sem); 172 down_read(&current->mm->mmap_sem);
173 if (flags & MS_SYNC)
174 current->flags |= PF_SYNCWRITE;
175 vma = find_vma(current->mm, start); 173 vma = find_vma(current->mm, start);
176 if (!vma) { 174 if (!vma) {
177 error = -ENOMEM; 175 error = -ENOMEM;
@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
228 } 226 }
229 } while (vma && !done); 227 } while (vma && !done);
230out_unlock: 228out_unlock:
231 current->flags &= ~PF_SYNCWRITE;
232 up_read(&current->mm->mmap_sem); 229 up_read(&current->mm->mmap_sem);
233out: 230out:
234 return error; 231 return error;
diff --git a/mm/nommu.c b/mm/nommu.c
index 029fadac0fb5..5151c44a8257 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1122,7 +1122,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
1122 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 1122 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
1123 unsigned long n; 1123 unsigned long n;
1124 1124
1125 free = get_page_cache_size(); 1125 free = global_page_state(NR_FILE_PAGES);
1126 free += nr_swap_pages; 1126 free += nr_swap_pages;
1127 1127
1128 /* 1128 /*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 042e6436c3ee..d46ed0f1dc06 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -22,10 +22,11 @@
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24 24
25int sysctl_panic_on_oom;
25/* #define DEBUG */ 26/* #define DEBUG */
26 27
27/** 28/**
28 * oom_badness - calculate a numeric value for how bad this task has been 29 * badness - calculate a numeric value for how bad this task has been
29 * @p: task struct of which task we should calculate 30 * @p: task struct of which task we should calculate
30 * @uptime: current uptime in seconds 31 * @uptime: current uptime in seconds
31 * 32 *
@@ -200,7 +201,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
200 continue; 201 continue;
201 202
202 /* 203 /*
203 * This is in the process of releasing memory so for wait it 204 * This is in the process of releasing memory so wait for it
204 * to finish before killing some other task by mistake. 205 * to finish before killing some other task by mistake.
205 */ 206 */
206 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || 207 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
@@ -306,7 +307,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
306} 307}
307 308
308/** 309/**
309 * oom_kill - kill the "best" process when we run out of memory 310 * out_of_memory - kill the "best" process when we run out of memory
310 * 311 *
311 * If we run out of memory, we have the choice between either 312 * If we run out of memory, we have the choice between either
312 * killing a random task (bad), letting the system crash (worse) 313 * killing a random task (bad), letting the system crash (worse)
@@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
344 break; 345 break;
345 346
346 case CONSTRAINT_NONE: 347 case CONSTRAINT_NONE:
348 if (sysctl_panic_on_oom)
349 panic("out of memory. panic_on_oom is selected\n");
347retry: 350retry:
348 /* 351 /*
349 * Rambo mode: Shoot down a process and hope it solves whatever 352 * Rambo mode: Shoot down a process and hope it solves whatever
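
The new sysctl_panic_on_oom flag above only matters in the CONSTRAINT_NONE case, that is, when the OOM is not confined to a cpuset or memory policy; the variable is wired up as the vm.panic_on_oom sysctl. A minimal userspace sketch of the policy, with exit() standing in for panic() and a hypothetical constrained flag:

/* Userspace sketch of the panic_on_oom decision; not kernel code. */
#include <stdio.h>
#include <stdlib.h>

static int sysctl_panic_on_oom;   /* 0 by default, like the kernel knob */

static void out_of_memory(int constrained)
{
        if (!constrained && sysctl_panic_on_oom) {
                fprintf(stderr, "out of memory. panic_on_oom is selected\n");
                exit(1);          /* panic() in the kernel */
        }
        printf("pick and kill the \"worst\" task instead\n");
}

int main(void)
{
        out_of_memory(0);         /* default: kill a task */
        sysctl_panic_on_oom = 1;
        out_of_memory(1);         /* constrained OOM: still kill a task */
        out_of_memory(0);         /* unconstrained + knob set: "panic" */
        return 0;
}
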
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 75d7f48b79bb..e630188ccc40 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -99,22 +99,6 @@ EXPORT_SYMBOL(laptop_mode);
99 99
100static void background_writeout(unsigned long _min_pages); 100static void background_writeout(unsigned long _min_pages);
101 101
102struct writeback_state
103{
104 unsigned long nr_dirty;
105 unsigned long nr_unstable;
106 unsigned long nr_mapped;
107 unsigned long nr_writeback;
108};
109
110static void get_writeback_state(struct writeback_state *wbs)
111{
112 wbs->nr_dirty = read_page_state(nr_dirty);
113 wbs->nr_unstable = read_page_state(nr_unstable);
114 wbs->nr_mapped = read_page_state(nr_mapped);
115 wbs->nr_writeback = read_page_state(nr_writeback);
116}
117
118/* 102/*
119 * Work out the current dirty-memory clamping and background writeout 103 * Work out the current dirty-memory clamping and background writeout
120 * thresholds. 104 * thresholds.
@@ -133,8 +117,8 @@ static void get_writeback_state(struct writeback_state *wbs)
133 * clamping level. 117 * clamping level.
134 */ 118 */
135static void 119static void
136get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, 120get_dirty_limits(long *pbackground, long *pdirty,
137 struct address_space *mapping) 121 struct address_space *mapping)
138{ 122{
139 int background_ratio; /* Percentages */ 123 int background_ratio; /* Percentages */
140 int dirty_ratio; 124 int dirty_ratio;
@@ -144,8 +128,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
144 unsigned long available_memory = total_pages; 128 unsigned long available_memory = total_pages;
145 struct task_struct *tsk; 129 struct task_struct *tsk;
146 130
147 get_writeback_state(wbs);
148
149#ifdef CONFIG_HIGHMEM 131#ifdef CONFIG_HIGHMEM
150 /* 132 /*
151 * If this mapping can only allocate from low memory, 133 * If this mapping can only allocate from low memory,
@@ -156,7 +138,9 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
156#endif 138#endif
157 139
158 140
159 unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages; 141 unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
142 global_page_state(NR_ANON_PAGES)) * 100) /
143 total_pages;
160 144
161 dirty_ratio = vm_dirty_ratio; 145 dirty_ratio = vm_dirty_ratio;
162 if (dirty_ratio > unmapped_ratio / 2) 146 if (dirty_ratio > unmapped_ratio / 2)
@@ -189,7 +173,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
189 */ 173 */
190static void balance_dirty_pages(struct address_space *mapping) 174static void balance_dirty_pages(struct address_space *mapping)
191{ 175{
192 struct writeback_state wbs;
193 long nr_reclaimable; 176 long nr_reclaimable;
194 long background_thresh; 177 long background_thresh;
195 long dirty_thresh; 178 long dirty_thresh;
@@ -204,13 +187,15 @@ static void balance_dirty_pages(struct address_space *mapping)
204 .sync_mode = WB_SYNC_NONE, 187 .sync_mode = WB_SYNC_NONE,
205 .older_than_this = NULL, 188 .older_than_this = NULL,
206 .nr_to_write = write_chunk, 189 .nr_to_write = write_chunk,
190 .range_cyclic = 1,
207 }; 191 };
208 192
209 get_dirty_limits(&wbs, &background_thresh, 193 get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
210 &dirty_thresh, mapping); 194 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
211 nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; 195 global_page_state(NR_UNSTABLE_NFS);
212 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) 196 if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
213 break; 197 dirty_thresh)
198 break;
214 199
215 if (!dirty_exceeded) 200 if (!dirty_exceeded)
216 dirty_exceeded = 1; 201 dirty_exceeded = 1;
@@ -223,11 +208,14 @@ static void balance_dirty_pages(struct address_space *mapping)
223 */ 208 */
224 if (nr_reclaimable) { 209 if (nr_reclaimable) {
225 writeback_inodes(&wbc); 210 writeback_inodes(&wbc);
226 get_dirty_limits(&wbs, &background_thresh, 211 get_dirty_limits(&background_thresh,
227 &dirty_thresh, mapping); 212 &dirty_thresh, mapping);
228 nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; 213 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
229 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) 214 global_page_state(NR_UNSTABLE_NFS);
230 break; 215 if (nr_reclaimable +
216 global_page_state(NR_WRITEBACK)
217 <= dirty_thresh)
218 break;
231 pages_written += write_chunk - wbc.nr_to_write; 219 pages_written += write_chunk - wbc.nr_to_write;
232 if (pages_written >= write_chunk) 220 if (pages_written >= write_chunk)
233 break; /* We've done our duty */ 221 break; /* We've done our duty */
@@ -235,8 +223,9 @@ static void balance_dirty_pages(struct address_space *mapping)
235 blk_congestion_wait(WRITE, HZ/10); 223 blk_congestion_wait(WRITE, HZ/10);
236 } 224 }
237 225
238 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded) 226 if (nr_reclaimable + global_page_state(NR_WRITEBACK)
239 dirty_exceeded = 0; 227 <= dirty_thresh && dirty_exceeded)
228 dirty_exceeded = 0;
240 229
241 if (writeback_in_progress(bdi)) 230 if (writeback_in_progress(bdi))
242 return; /* pdflush is already working this queue */ 231 return; /* pdflush is already working this queue */
@@ -298,12 +287,11 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
298 287
299void throttle_vm_writeout(void) 288void throttle_vm_writeout(void)
300{ 289{
301 struct writeback_state wbs;
302 long background_thresh; 290 long background_thresh;
303 long dirty_thresh; 291 long dirty_thresh;
304 292
305 for ( ; ; ) { 293 for ( ; ; ) {
306 get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); 294 get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
307 295
308 /* 296 /*
309 * Boost the allowable dirty threshold a bit for page 297 * Boost the allowable dirty threshold a bit for page
@@ -311,8 +299,9 @@ void throttle_vm_writeout(void)
311 */ 299 */
312 dirty_thresh += dirty_thresh / 10; /* wheeee... */ 300 dirty_thresh += dirty_thresh / 10; /* wheeee... */
313 301
314 if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) 302 if (global_page_state(NR_UNSTABLE_NFS) +
315 break; 303 global_page_state(NR_WRITEBACK) <= dirty_thresh)
304 break;
316 blk_congestion_wait(WRITE, HZ/10); 305 blk_congestion_wait(WRITE, HZ/10);
317 } 306 }
318} 307}
@@ -331,15 +320,16 @@ static void background_writeout(unsigned long _min_pages)
331 .older_than_this = NULL, 320 .older_than_this = NULL,
332 .nr_to_write = 0, 321 .nr_to_write = 0,
333 .nonblocking = 1, 322 .nonblocking = 1,
323 .range_cyclic = 1,
334 }; 324 };
335 325
336 for ( ; ; ) { 326 for ( ; ; ) {
337 struct writeback_state wbs;
338 long background_thresh; 327 long background_thresh;
339 long dirty_thresh; 328 long dirty_thresh;
340 329
341 get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); 330 get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
342 if (wbs.nr_dirty + wbs.nr_unstable < background_thresh 331 if (global_page_state(NR_FILE_DIRTY) +
332 global_page_state(NR_UNSTABLE_NFS) < background_thresh
343 && min_pages <= 0) 333 && min_pages <= 0)
344 break; 334 break;
345 wbc.encountered_congestion = 0; 335 wbc.encountered_congestion = 0;
@@ -363,12 +353,9 @@ static void background_writeout(unsigned long _min_pages)
363 */ 353 */
364int wakeup_pdflush(long nr_pages) 354int wakeup_pdflush(long nr_pages)
365{ 355{
366 if (nr_pages == 0) { 356 if (nr_pages == 0)
367 struct writeback_state wbs; 357 nr_pages = global_page_state(NR_FILE_DIRTY) +
368 358 global_page_state(NR_UNSTABLE_NFS);
369 get_writeback_state(&wbs);
370 nr_pages = wbs.nr_dirty + wbs.nr_unstable;
371 }
372 return pdflush_operation(background_writeout, nr_pages); 359 return pdflush_operation(background_writeout, nr_pages);
373} 360}
374 361
@@ -399,7 +386,6 @@ static void wb_kupdate(unsigned long arg)
399 unsigned long start_jif; 386 unsigned long start_jif;
400 unsigned long next_jif; 387 unsigned long next_jif;
401 long nr_to_write; 388 long nr_to_write;
402 struct writeback_state wbs;
403 struct writeback_control wbc = { 389 struct writeback_control wbc = {
404 .bdi = NULL, 390 .bdi = NULL,
405 .sync_mode = WB_SYNC_NONE, 391 .sync_mode = WB_SYNC_NONE,
@@ -407,15 +393,16 @@ static void wb_kupdate(unsigned long arg)
407 .nr_to_write = 0, 393 .nr_to_write = 0,
408 .nonblocking = 1, 394 .nonblocking = 1,
409 .for_kupdate = 1, 395 .for_kupdate = 1,
396 .range_cyclic = 1,
410 }; 397 };
411 398
412 sync_supers(); 399 sync_supers();
413 400
414 get_writeback_state(&wbs);
415 oldest_jif = jiffies - dirty_expire_interval; 401 oldest_jif = jiffies - dirty_expire_interval;
416 start_jif = jiffies; 402 start_jif = jiffies;
417 next_jif = start_jif + dirty_writeback_interval; 403 next_jif = start_jif + dirty_writeback_interval;
418 nr_to_write = wbs.nr_dirty + wbs.nr_unstable + 404 nr_to_write = global_page_state(NR_FILE_DIRTY) +
405 global_page_state(NR_UNSTABLE_NFS) +
419 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 406 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
420 while (nr_to_write > 0) { 407 while (nr_to_write > 0) {
421 wbc.encountered_congestion = 0; 408 wbc.encountered_congestion = 0;
@@ -513,14 +500,14 @@ static void set_ratelimit(void)
513 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; 500 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
514} 501}
515 502
516static int 503static int __cpuinit
517ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) 504ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
518{ 505{
519 set_ratelimit(); 506 set_ratelimit();
520 return 0; 507 return 0;
521} 508}
522 509
523static struct notifier_block ratelimit_nb = { 510static struct notifier_block __cpuinitdata ratelimit_nb = {
524 .notifier_call = ratelimit_handler, 511 .notifier_call = ratelimit_handler,
525 .next = NULL, 512 .next = NULL,
526}; 513};
@@ -637,7 +624,8 @@ int __set_page_dirty_nobuffers(struct page *page)
637 if (mapping2) { /* Race with truncate? */ 624 if (mapping2) { /* Race with truncate? */
638 BUG_ON(mapping2 != mapping); 625 BUG_ON(mapping2 != mapping);
639 if (mapping_cap_account_dirty(mapping)) 626 if (mapping_cap_account_dirty(mapping))
640 inc_page_state(nr_dirty); 627 __inc_zone_page_state(page,
628 NR_FILE_DIRTY);
641 radix_tree_tag_set(&mapping->page_tree, 629 radix_tree_tag_set(&mapping->page_tree,
642 page_index(page), PAGECACHE_TAG_DIRTY); 630 page_index(page), PAGECACHE_TAG_DIRTY);
643 } 631 }
@@ -724,9 +712,9 @@ int test_clear_page_dirty(struct page *page)
724 radix_tree_tag_clear(&mapping->page_tree, 712 radix_tree_tag_clear(&mapping->page_tree,
725 page_index(page), 713 page_index(page),
726 PAGECACHE_TAG_DIRTY); 714 PAGECACHE_TAG_DIRTY);
727 write_unlock_irqrestore(&mapping->tree_lock, flags);
728 if (mapping_cap_account_dirty(mapping)) 715 if (mapping_cap_account_dirty(mapping))
729 dec_page_state(nr_dirty); 716 __dec_zone_page_state(page, NR_FILE_DIRTY);
717 write_unlock_irqrestore(&mapping->tree_lock, flags);
730 return 1; 718 return 1;
731 } 719 }
732 write_unlock_irqrestore(&mapping->tree_lock, flags); 720 write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -757,7 +745,7 @@ int clear_page_dirty_for_io(struct page *page)
757 if (mapping) { 745 if (mapping) {
758 if (TestClearPageDirty(page)) { 746 if (TestClearPageDirty(page)) {
759 if (mapping_cap_account_dirty(mapping)) 747 if (mapping_cap_account_dirty(mapping))
760 dec_page_state(nr_dirty); 748 dec_zone_page_state(page, NR_FILE_DIRTY);
761 return 1; 749 return 1;
762 } 750 }
763 return 0; 751 return 0;
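
With struct writeback_state gone, get_dirty_limits() and its callers read NR_FILE_DIRTY, NR_UNSTABLE_NFS, NR_WRITEBACK, NR_FILE_MAPPED and NR_ANON_PAGES straight from the global vm counters. A rough worked example of the clamping shown above, with hypothetical numbers (the background-ratio and per-task adjustments that follow in the real function are omitted):

/* Simplified model of the unmapped_ratio / dirty_ratio clamp. */
#include <stdio.h>

int main(void)
{
        unsigned long total_pages = 262144;   /* ~1GB of 4K pages */
        unsigned long mapped      = 200000;   /* NR_FILE_MAPPED + NR_ANON_PAGES */
        int vm_dirty_ratio        = 40;       /* /proc/sys/vm/dirty_ratio */

        int unmapped_ratio = 100 - (int)((mapped * 100) / total_pages);
        int dirty_ratio    = vm_dirty_ratio;

        if (dirty_ratio > unmapped_ratio / 2)
                dirty_ratio = unmapped_ratio / 2;

        printf("unmapped_ratio=%d%%, effective dirty_ratio=%d%%, "
               "dirty threshold ~= %lu pages\n",
               unmapped_ratio, dirty_ratio,
               (unsigned long)dirty_ratio * total_pages / 100);
        return 0;
}
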
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 253a450c400d..3e792a583f3b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -14,7 +14,6 @@
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17#include <linux/config.h>
18#include <linux/stddef.h> 17#include <linux/stddef.h>
19#include <linux/mm.h> 18#include <linux/mm.h>
20#include <linux/swap.h> 19#include <linux/swap.h>
@@ -37,6 +36,7 @@
37#include <linux/nodemask.h> 36#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 37#include <linux/vmalloc.h>
39#include <linux/mempolicy.h> 38#include <linux/mempolicy.h>
39#include <linux/stop_machine.h>
40 40
41#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
42#include <asm/div64.h> 42#include <asm/div64.h>
@@ -83,8 +83,8 @@ EXPORT_SYMBOL(zone_table);
83static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; 83static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
84int min_free_kbytes = 1024; 84int min_free_kbytes = 1024;
85 85
86unsigned long __initdata nr_kernel_pages; 86unsigned long __meminitdata nr_kernel_pages;
87unsigned long __initdata nr_all_pages; 87unsigned long __meminitdata nr_all_pages;
88 88
89#ifdef CONFIG_DEBUG_VM 89#ifdef CONFIG_DEBUG_VM
90static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 90static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -265,7 +265,7 @@ static inline void rmv_page_order(struct page *page)
265 * satisfies the following equation: 265 * satisfies the following equation:
266 * P = B & ~(1 << O) 266 * P = B & ~(1 << O)
267 * 267 *
268 * Assumption: *_mem_map is contigious at least up to MAX_ORDER 268 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
269 */ 269 */
270static inline struct page * 270static inline struct page *
271__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 271__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
@@ -286,22 +286,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
286 * we can do coalesce a page and its buddy if 286 * we can do coalesce a page and its buddy if
287 * (a) the buddy is not in a hole && 287 * (a) the buddy is not in a hole &&
288 * (b) the buddy is in the buddy system && 288 * (b) the buddy is in the buddy system &&
289 * (c) a page and its buddy have the same order. 289 * (c) a page and its buddy have the same order &&
290 * (d) a page and its buddy are in the same zone.
290 * 291 *
291 * For recording whether a page is in the buddy system, we use PG_buddy. 292 * For recording whether a page is in the buddy system, we use PG_buddy.
292 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 293 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
293 * 294 *
294 * For recording page's order, we use page_private(page). 295 * For recording page's order, we use page_private(page).
295 */ 296 */
296static inline int page_is_buddy(struct page *page, int order) 297static inline int page_is_buddy(struct page *page, struct page *buddy,
298 int order)
297{ 299{
298#ifdef CONFIG_HOLES_IN_ZONE 300#ifdef CONFIG_HOLES_IN_ZONE
299 if (!pfn_valid(page_to_pfn(page))) 301 if (!pfn_valid(page_to_pfn(buddy)))
300 return 0; 302 return 0;
301#endif 303#endif
302 304
303 if (PageBuddy(page) && page_order(page) == order) { 305 if (page_zone_id(page) != page_zone_id(buddy))
304 BUG_ON(page_count(page) != 0); 306 return 0;
307
308 if (PageBuddy(buddy) && page_order(buddy) == order) {
309 BUG_ON(page_count(buddy) != 0);
305 return 1; 310 return 1;
306 } 311 }
307 return 0; 312 return 0;
@@ -352,7 +357,7 @@ static inline void __free_one_page(struct page *page,
352 struct page *buddy; 357 struct page *buddy;
353 358
354 buddy = __page_find_buddy(page, page_idx, order); 359 buddy = __page_find_buddy(page, page_idx, order);
355 if (!page_is_buddy(buddy, order)) 360 if (!page_is_buddy(page, buddy, order))
356 break; /* Move the buddy up one level. */ 361 break; /* Move the buddy up one level. */
357 362
358 list_del(&buddy->lru); 363 list_del(&buddy->lru);
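
page_is_buddy() now takes both pages so it can also reject a would-be buddy that sits in a different zone (the new condition (d) in the comment above); the index arithmetic itself is unchanged. A worked example of that arithmetic, matching the P = B & ~(1 << O) relation quoted earlier:

/* Buddy index arithmetic for a block of 2^order pages at page_idx. */
#include <stdio.h>

int main(void)
{
        unsigned long page_idx = 10;    /* binary 1010 */
        unsigned int order = 1;         /* block of 2 pages */

        unsigned long buddy_idx    = page_idx ^ (1UL << order);   /* 8 */
        unsigned long combined_idx = page_idx & ~(1UL << order);  /* 8 */

        printf("buddy of block %lu (order %u) is %lu; "
               "merged block starts at %lu with order %u\n",
               page_idx, order, buddy_idx, combined_idx, order + 1);
        return 0;
}
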
@@ -440,8 +445,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
440 445
441 arch_free_page(page, order); 446 arch_free_page(page, order);
442 if (!PageHighMem(page)) 447 if (!PageHighMem(page))
443 mutex_debug_check_no_locks_freed(page_address(page), 448 debug_check_no_locks_freed(page_address(page),
444 PAGE_SIZE<<order); 449 PAGE_SIZE<<order);
445 450
446 for (i = 0 ; i < (1 << order) ; ++i) 451 for (i = 0 ; i < (1 << order) ; ++i)
447 reserved += free_pages_check(page + i); 452 reserved += free_pages_check(page + i);
@@ -450,7 +455,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
450 455
451 kernel_map_pages(page, 1 << order, 0); 456 kernel_map_pages(page, 1 << order, 0);
452 local_irq_save(flags); 457 local_irq_save(flags);
453 __mod_page_state(pgfree, 1 << order); 458 __count_vm_events(PGFREE, 1 << order);
454 free_one_page(page_zone(page), page, order); 459 free_one_page(page_zone(page), page, order);
455 local_irq_restore(flags); 460 local_irq_restore(flags);
456} 461}
@@ -703,27 +708,6 @@ void drain_local_pages(void)
703} 708}
704#endif /* CONFIG_PM */ 709#endif /* CONFIG_PM */
705 710
706static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
707{
708#ifdef CONFIG_NUMA
709 pg_data_t *pg = z->zone_pgdat;
710 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
711 struct per_cpu_pageset *p;
712
713 p = zone_pcp(z, cpu);
714 if (pg == orig) {
715 p->numa_hit++;
716 } else {
717 p->numa_miss++;
718 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
719 }
720 if (pg == NODE_DATA(numa_node_id()))
721 p->local_node++;
722 else
723 p->other_node++;
724#endif
725}
726
727/* 711/*
728 * Free a 0-order page 712 * Free a 0-order page
729 */ 713 */
@@ -744,7 +728,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
744 728
745 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 729 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
746 local_irq_save(flags); 730 local_irq_save(flags);
747 __inc_page_state(pgfree); 731 __count_vm_event(PGFREE);
748 list_add(&page->lru, &pcp->list); 732 list_add(&page->lru, &pcp->list);
749 pcp->count++; 733 pcp->count++;
750 if (pcp->count >= pcp->high) { 734 if (pcp->count >= pcp->high) {
@@ -820,8 +804,8 @@ again:
820 goto failed; 804 goto failed;
821 } 805 }
822 806
823 __mod_page_state_zone(zone, pgalloc, 1 << order); 807 __count_zone_vm_events(PGALLOC, zone, 1 << order);
824 zone_statistics(zonelist, zone, cpu); 808 zone_statistics(zonelist, zone);
825 local_irq_restore(flags); 809 local_irq_restore(flags);
826 put_cpu(); 810 put_cpu();
827 811
@@ -951,8 +935,7 @@ restart:
951 goto got_pg; 935 goto got_pg;
952 936
953 do { 937 do {
954 if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL)) 938 wakeup_kswapd(*z, order);
955 wakeup_kswapd(*z, order);
956 } while (*(++z)); 939 } while (*(++z));
957 940
958 /* 941 /*
@@ -1226,141 +1209,6 @@ static void show_node(struct zone *zone)
1226#define show_node(zone) do { } while (0) 1209#define show_node(zone) do { } while (0)
1227#endif 1210#endif
1228 1211
1229/*
1230 * Accumulate the page_state information across all CPUs.
1231 * The result is unavoidably approximate - it can change
1232 * during and after execution of this function.
1233 */
1234static DEFINE_PER_CPU(struct page_state, page_states) = {0};
1235
1236atomic_t nr_pagecache = ATOMIC_INIT(0);
1237EXPORT_SYMBOL(nr_pagecache);
1238#ifdef CONFIG_SMP
1239DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1240#endif
1241
1242static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1243{
1244 unsigned cpu;
1245
1246 memset(ret, 0, nr * sizeof(unsigned long));
1247 cpus_and(*cpumask, *cpumask, cpu_online_map);
1248
1249 for_each_cpu_mask(cpu, *cpumask) {
1250 unsigned long *in;
1251 unsigned long *out;
1252 unsigned off;
1253 unsigned next_cpu;
1254
1255 in = (unsigned long *)&per_cpu(page_states, cpu);
1256
1257 next_cpu = next_cpu(cpu, *cpumask);
1258 if (likely(next_cpu < NR_CPUS))
1259 prefetch(&per_cpu(page_states, next_cpu));
1260
1261 out = (unsigned long *)ret;
1262 for (off = 0; off < nr; off++)
1263 *out++ += *in++;
1264 }
1265}
1266
1267void get_page_state_node(struct page_state *ret, int node)
1268{
1269 int nr;
1270 cpumask_t mask = node_to_cpumask(node);
1271
1272 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1273 nr /= sizeof(unsigned long);
1274
1275 __get_page_state(ret, nr+1, &mask);
1276}
1277
1278void get_page_state(struct page_state *ret)
1279{
1280 int nr;
1281 cpumask_t mask = CPU_MASK_ALL;
1282
1283 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1284 nr /= sizeof(unsigned long);
1285
1286 __get_page_state(ret, nr + 1, &mask);
1287}
1288
1289void get_full_page_state(struct page_state *ret)
1290{
1291 cpumask_t mask = CPU_MASK_ALL;
1292
1293 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1294}
1295
1296unsigned long read_page_state_offset(unsigned long offset)
1297{
1298 unsigned long ret = 0;
1299 int cpu;
1300
1301 for_each_online_cpu(cpu) {
1302 unsigned long in;
1303
1304 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
1305 ret += *((unsigned long *)in);
1306 }
1307 return ret;
1308}
1309
1310void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1311{
1312 void *ptr;
1313
1314 ptr = &__get_cpu_var(page_states);
1315 *(unsigned long *)(ptr + offset) += delta;
1316}
1317EXPORT_SYMBOL(__mod_page_state_offset);
1318
1319void mod_page_state_offset(unsigned long offset, unsigned long delta)
1320{
1321 unsigned long flags;
1322 void *ptr;
1323
1324 local_irq_save(flags);
1325 ptr = &__get_cpu_var(page_states);
1326 *(unsigned long *)(ptr + offset) += delta;
1327 local_irq_restore(flags);
1328}
1329EXPORT_SYMBOL(mod_page_state_offset);
1330
1331void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1332 unsigned long *free, struct pglist_data *pgdat)
1333{
1334 struct zone *zones = pgdat->node_zones;
1335 int i;
1336
1337 *active = 0;
1338 *inactive = 0;
1339 *free = 0;
1340 for (i = 0; i < MAX_NR_ZONES; i++) {
1341 *active += zones[i].nr_active;
1342 *inactive += zones[i].nr_inactive;
1343 *free += zones[i].free_pages;
1344 }
1345}
1346
1347void get_zone_counts(unsigned long *active,
1348 unsigned long *inactive, unsigned long *free)
1349{
1350 struct pglist_data *pgdat;
1351
1352 *active = 0;
1353 *inactive = 0;
1354 *free = 0;
1355 for_each_online_pgdat(pgdat) {
1356 unsigned long l, m, n;
1357 __get_zone_counts(&l, &m, &n, pgdat);
1358 *active += l;
1359 *inactive += m;
1360 *free += n;
1361 }
1362}
1363
1364void si_meminfo(struct sysinfo *val) 1212void si_meminfo(struct sysinfo *val)
1365{ 1213{
1366 val->totalram = totalram_pages; 1214 val->totalram = totalram_pages;
@@ -1401,7 +1249,6 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1401 */ 1249 */
1402void show_free_areas(void) 1250void show_free_areas(void)
1403{ 1251{
1404 struct page_state ps;
1405 int cpu, temperature; 1252 int cpu, temperature;
1406 unsigned long active; 1253 unsigned long active;
1407 unsigned long inactive; 1254 unsigned long inactive;
@@ -1433,7 +1280,6 @@ void show_free_areas(void)
1433 } 1280 }
1434 } 1281 }
1435 1282
1436 get_page_state(&ps);
1437 get_zone_counts(&active, &inactive, &free); 1283 get_zone_counts(&active, &inactive, &free);
1438 1284
1439 printk("Free pages: %11ukB (%ukB HighMem)\n", 1285 printk("Free pages: %11ukB (%ukB HighMem)\n",
@@ -1444,13 +1290,13 @@ void show_free_areas(void)
1444 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1290 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1445 active, 1291 active,
1446 inactive, 1292 inactive,
1447 ps.nr_dirty, 1293 global_page_state(NR_FILE_DIRTY),
1448 ps.nr_writeback, 1294 global_page_state(NR_WRITEBACK),
1449 ps.nr_unstable, 1295 global_page_state(NR_UNSTABLE_NFS),
1450 nr_free_pages(), 1296 nr_free_pages(),
1451 ps.nr_slab, 1297 global_page_state(NR_SLAB),
1452 ps.nr_mapped, 1298 global_page_state(NR_FILE_MAPPED),
1453 ps.nr_page_table_pages); 1299 global_page_state(NR_PAGETABLE));
1454 1300
1455 for_each_zone(zone) { 1301 for_each_zone(zone) {
1456 int i; 1302 int i;
@@ -1485,7 +1331,7 @@ void show_free_areas(void)
1485 } 1331 }
1486 1332
1487 for_each_zone(zone) { 1333 for_each_zone(zone) {
1488 unsigned long nr, flags, order, total = 0; 1334 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1489 1335
1490 show_node(zone); 1336 show_node(zone);
1491 printk("%s: ", zone->name); 1337 printk("%s: ", zone->name);
@@ -1496,11 +1342,12 @@ void show_free_areas(void)
1496 1342
1497 spin_lock_irqsave(&zone->lock, flags); 1343 spin_lock_irqsave(&zone->lock, flags);
1498 for (order = 0; order < MAX_ORDER; order++) { 1344 for (order = 0; order < MAX_ORDER; order++) {
1499 nr = zone->free_area[order].nr_free; 1345 nr[order] = zone->free_area[order].nr_free;
1500 total += nr << order; 1346 total += nr[order] << order;
1501 printk("%lu*%lukB ", nr, K(1UL) << order);
1502 } 1347 }
1503 spin_unlock_irqrestore(&zone->lock, flags); 1348 spin_unlock_irqrestore(&zone->lock, flags);
1349 for (order = 0; order < MAX_ORDER; order++)
1350 printk("%lu*%lukB ", nr[order], K(1UL) << order);
1504 printk("= %lukB\n", K(total)); 1351 printk("= %lukB\n", K(total));
1505 } 1352 }
1506 1353
@@ -1512,7 +1359,7 @@ void show_free_areas(void)
1512 * 1359 *
1513 * Add all populated zones of a node to the zonelist. 1360 * Add all populated zones of a node to the zonelist.
1514 */ 1361 */
1515static int __init build_zonelists_node(pg_data_t *pgdat, 1362static int __meminit build_zonelists_node(pg_data_t *pgdat,
1516 struct zonelist *zonelist, int nr_zones, int zone_type) 1363 struct zonelist *zonelist, int nr_zones, int zone_type)
1517{ 1364{
1518 struct zone *zone; 1365 struct zone *zone;
@@ -1548,7 +1395,7 @@ static inline int highest_zone(int zone_bits)
1548 1395
1549#ifdef CONFIG_NUMA 1396#ifdef CONFIG_NUMA
1550#define MAX_NODE_LOAD (num_online_nodes()) 1397#define MAX_NODE_LOAD (num_online_nodes())
1551static int __initdata node_load[MAX_NUMNODES]; 1398static int __meminitdata node_load[MAX_NUMNODES];
1552/** 1399/**
1553 * find_next_best_node - find the next node that should appear in a given node's fallback list 1400 * find_next_best_node - find the next node that should appear in a given node's fallback list
1554 * @node: node whose fallback list we're appending 1401 * @node: node whose fallback list we're appending
@@ -1563,7 +1410,7 @@ static int __initdata node_load[MAX_NUMNODES];
1563 * on them otherwise. 1410 * on them otherwise.
1564 * It returns -1 if no node is found. 1411 * It returns -1 if no node is found.
1565 */ 1412 */
1566static int __init find_next_best_node(int node, nodemask_t *used_node_mask) 1413static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1567{ 1414{
1568 int n, val; 1415 int n, val;
1569 int min_val = INT_MAX; 1416 int min_val = INT_MAX;
@@ -1609,7 +1456,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
1609 return best_node; 1456 return best_node;
1610} 1457}
1611 1458
1612static void __init build_zonelists(pg_data_t *pgdat) 1459static void __meminit build_zonelists(pg_data_t *pgdat)
1613{ 1460{
1614 int i, j, k, node, local_node; 1461 int i, j, k, node, local_node;
1615 int prev_node, load; 1462 int prev_node, load;
@@ -1661,7 +1508,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
1661 1508
1662#else /* CONFIG_NUMA */ 1509#else /* CONFIG_NUMA */
1663 1510
1664static void __init build_zonelists(pg_data_t *pgdat) 1511static void __meminit build_zonelists(pg_data_t *pgdat)
1665{ 1512{
1666 int i, j, k, node, local_node; 1513 int i, j, k, node, local_node;
1667 1514
@@ -1699,14 +1546,29 @@ static void __init build_zonelists(pg_data_t *pgdat)
1699 1546
1700#endif /* CONFIG_NUMA */ 1547#endif /* CONFIG_NUMA */
1701 1548
1702void __init build_all_zonelists(void) 1549/* return values int ....just for stop_machine_run() */
1550static int __meminit __build_all_zonelists(void *dummy)
1703{ 1551{
1704 int i; 1552 int nid;
1553 for_each_online_node(nid)
1554 build_zonelists(NODE_DATA(nid));
1555 return 0;
1556}
1705 1557
1706 for_each_online_node(i) 1558void __meminit build_all_zonelists(void)
1707 build_zonelists(NODE_DATA(i)); 1559{
1708 printk("Built %i zonelists\n", num_online_nodes()); 1560 if (system_state == SYSTEM_BOOTING) {
1709 cpuset_init_current_mems_allowed(); 1561 __build_all_zonelists(0);
1562 cpuset_init_current_mems_allowed();
1563 } else {
 1564		/* we have to stop all cpus to guarantee there is no user
1565 of zonelist */
1566 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
1567 /* cpuset refresh routine should be here */
1568 }
1569 vm_total_pages = nr_free_pagecache_pages();
1570 printk("Built %i zonelists. Total pages: %ld\n",
1571 num_online_nodes(), vm_total_pages);
1710} 1572}
1711 1573
1712/* 1574/*
@@ -1722,7 +1584,8 @@ void __init build_all_zonelists(void)
1722 */ 1584 */
1723#define PAGES_PER_WAITQUEUE 256 1585#define PAGES_PER_WAITQUEUE 256
1724 1586
1725static inline unsigned long wait_table_size(unsigned long pages) 1587#ifndef CONFIG_MEMORY_HOTPLUG
1588static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1726{ 1589{
1727 unsigned long size = 1; 1590 unsigned long size = 1;
1728 1591
@@ -1740,6 +1603,29 @@ static inline unsigned long wait_table_size(unsigned long pages)
1740 1603
1741 return max(size, 4UL); 1604 return max(size, 4UL);
1742} 1605}
1606#else
1607/*
1608 * A zone's size might be changed by hot-add, so it is not possible to determine
1609 * a suitable size for its wait_table. So we use the maximum size now.
1610 *
1611 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
1612 *
1613 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
1614 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
1615 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
1616 *
1617 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
1618 * or more by the traditional way. (See above). It equals:
1619 *
1620 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
1621 * ia64(16K page size) : = ( 8G + 4M)byte.
1622 * powerpc (64K page size) : = (32G +16M)byte.
1623 */
1624static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1625{
1626 return 4096UL;
1627}
1628#endif
1743 1629
1744/* 1630/*
1745 * This is an integer logarithm so that shifts can be used later 1631 * This is an integer logarithm so that shifts can be used later
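
For the non-hotplug case the hunk above only shows the renamed helper's tail, but the rule the comments describe is roughly: one hashed waitqueue per PAGES_PER_WAITQUEUE (256) pages, rounded up to a power of two and clamped to [4, 4096] entries, while hot-added zones simply use the 4096 maximum because their final size is unknown. A sketch of that sizing rule (a reconstruction, not the verbatim kernel body):

/* Approximate wait-table sizing: clamp(pow2_ceil(pages/256), 4, 4096). */
#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

static unsigned long wait_table_entries(unsigned long pages)
{
        unsigned long size = 1;

        pages /= PAGES_PER_WAITQUEUE;
        while (size < pages)
                size <<= 1;
        if (size > 4096UL)
                size = 4096UL;          /* the hot-add path just uses this max */
        return size < 4UL ? 4UL : size;
}

int main(void)
{
        /* (512K + 256) pages, i.e. (2G + 1M) bytes with 4K pages, hits the cap */
        printf("%lu entries for %lu pages\n",
               wait_table_entries(524544UL), 524544UL);
        printf("%lu entries for a 100-page zone\n", wait_table_entries(100UL));
        return 0;
}
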
@@ -1964,7 +1850,7 @@ static inline void free_zone_pagesets(int cpu)
1964 } 1850 }
1965} 1851}
1966 1852
1967static int pageset_cpuup_callback(struct notifier_block *nfb, 1853static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
1968 unsigned long action, 1854 unsigned long action,
1969 void *hcpu) 1855 void *hcpu)
1970{ 1856{
@@ -1986,7 +1872,7 @@ static int pageset_cpuup_callback(struct notifier_block *nfb,
1986 return ret; 1872 return ret;
1987} 1873}
1988 1874
1989static struct notifier_block pageset_notifier = 1875static struct notifier_block __cpuinitdata pageset_notifier =
1990 { &pageset_cpuup_callback, NULL, 0 }; 1876 { &pageset_cpuup_callback, NULL, 0 };
1991 1877
1992void __init setup_per_cpu_pageset(void) 1878void __init setup_per_cpu_pageset(void)
@@ -2005,23 +1891,46 @@ void __init setup_per_cpu_pageset(void)
2005#endif 1891#endif
2006 1892
2007static __meminit 1893static __meminit
2008void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 1894int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2009{ 1895{
2010 int i; 1896 int i;
2011 struct pglist_data *pgdat = zone->zone_pgdat; 1897 struct pglist_data *pgdat = zone->zone_pgdat;
1898 size_t alloc_size;
2012 1899
2013 /* 1900 /*
2014 * The per-page waitqueue mechanism uses hashed waitqueues 1901 * The per-page waitqueue mechanism uses hashed waitqueues
2015 * per zone. 1902 * per zone.
2016 */ 1903 */
2017 zone->wait_table_size = wait_table_size(zone_size_pages); 1904 zone->wait_table_hash_nr_entries =
2018 zone->wait_table_bits = wait_table_bits(zone->wait_table_size); 1905 wait_table_hash_nr_entries(zone_size_pages);
2019 zone->wait_table = (wait_queue_head_t *) 1906 zone->wait_table_bits =
2020 alloc_bootmem_node(pgdat, zone->wait_table_size 1907 wait_table_bits(zone->wait_table_hash_nr_entries);
2021 * sizeof(wait_queue_head_t)); 1908 alloc_size = zone->wait_table_hash_nr_entries
1909 * sizeof(wait_queue_head_t);
1910
1911 if (system_state == SYSTEM_BOOTING) {
1912 zone->wait_table = (wait_queue_head_t *)
1913 alloc_bootmem_node(pgdat, alloc_size);
1914 } else {
1915 /*
1916 * This case means that a zone whose size was 0 gets new memory
1917 * via memory hot-add.
1918 * But it may be the case that a new node was hot-added. In
1919 * this case vmalloc() will not be able to use this new node's
1920 * memory - this wait_table must be initialized to use this new
1921 * node itself as well.
1922 * To use this new node's memory, further consideration will be
1923 * necessary.
1924 */
1925 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
1926 }
1927 if (!zone->wait_table)
1928 return -ENOMEM;
2022 1929
2023 for(i = 0; i < zone->wait_table_size; ++i) 1930 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2024 init_waitqueue_head(zone->wait_table + i); 1931 init_waitqueue_head(zone->wait_table + i);
1932
1933 return 0;
2025} 1934}
2026 1935
2027static __meminit void zone_pcp_init(struct zone *zone) 1936static __meminit void zone_pcp_init(struct zone *zone)
@@ -2043,12 +1952,15 @@ static __meminit void zone_pcp_init(struct zone *zone)
2043 zone->name, zone->present_pages, batch); 1952 zone->name, zone->present_pages, batch);
2044} 1953}
2045 1954
2046static __meminit void init_currently_empty_zone(struct zone *zone, 1955__meminit int init_currently_empty_zone(struct zone *zone,
2047 unsigned long zone_start_pfn, unsigned long size) 1956 unsigned long zone_start_pfn,
1957 unsigned long size)
2048{ 1958{
2049 struct pglist_data *pgdat = zone->zone_pgdat; 1959 struct pglist_data *pgdat = zone->zone_pgdat;
2050 1960 int ret;
2051 zone_wait_table_init(zone, size); 1961 ret = zone_wait_table_init(zone, size);
1962 if (ret)
1963 return ret;
2052 pgdat->nr_zones = zone_idx(zone) + 1; 1964 pgdat->nr_zones = zone_idx(zone) + 1;
2053 1965
2054 zone->zone_start_pfn = zone_start_pfn; 1966 zone->zone_start_pfn = zone_start_pfn;
@@ -2056,6 +1968,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
2056 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 1968 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2057 1969
2058 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 1970 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1971
1972 return 0;
2059} 1973}
2060 1974
2061/* 1975/*
@@ -2064,12 +1978,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
2064 * - mark all memory queues empty 1978 * - mark all memory queues empty
2065 * - clear the memory bitmaps 1979 * - clear the memory bitmaps
2066 */ 1980 */
2067static void __init free_area_init_core(struct pglist_data *pgdat, 1981static void __meminit free_area_init_core(struct pglist_data *pgdat,
2068 unsigned long *zones_size, unsigned long *zholes_size) 1982 unsigned long *zones_size, unsigned long *zholes_size)
2069{ 1983{
2070 unsigned long j; 1984 unsigned long j;
2071 int nid = pgdat->node_id; 1985 int nid = pgdat->node_id;
2072 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1986 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1987 int ret;
2073 1988
2074 pgdat_resize_init(pgdat); 1989 pgdat_resize_init(pgdat);
2075 pgdat->nr_zones = 0; 1990 pgdat->nr_zones = 0;
@@ -2106,12 +2021,14 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
2106 zone->nr_scan_inactive = 0; 2021 zone->nr_scan_inactive = 0;
2107 zone->nr_active = 0; 2022 zone->nr_active = 0;
2108 zone->nr_inactive = 0; 2023 zone->nr_inactive = 0;
2024 zap_zone_vm_stats(zone);
2109 atomic_set(&zone->reclaim_in_progress, 0); 2025 atomic_set(&zone->reclaim_in_progress, 0);
2110 if (!size) 2026 if (!size)
2111 continue; 2027 continue;
2112 2028
2113 zonetable_add(zone, nid, j, zone_start_pfn, size); 2029 zonetable_add(zone, nid, j, zone_start_pfn, size);
2114 init_currently_empty_zone(zone, zone_start_pfn, size); 2030 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
2031 BUG_ON(ret);
2115 zone_start_pfn += size; 2032 zone_start_pfn += size;
2116 } 2033 }
2117} 2034}
@@ -2152,7 +2069,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2152#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2069#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2153} 2070}
2154 2071
2155void __init free_area_init_node(int nid, struct pglist_data *pgdat, 2072void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2156 unsigned long *zones_size, unsigned long node_start_pfn, 2073 unsigned long *zones_size, unsigned long node_start_pfn,
2157 unsigned long *zholes_size) 2074 unsigned long *zholes_size)
2158{ 2075{
@@ -2178,307 +2095,18 @@ void __init free_area_init(unsigned long *zones_size)
2178 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2095 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2179} 2096}
2180 2097
2181#ifdef CONFIG_PROC_FS
2182
2183#include <linux/seq_file.h>
2184
2185static void *frag_start(struct seq_file *m, loff_t *pos)
2186{
2187 pg_data_t *pgdat;
2188 loff_t node = *pos;
2189 for (pgdat = first_online_pgdat();
2190 pgdat && node;
2191 pgdat = next_online_pgdat(pgdat))
2192 --node;
2193
2194 return pgdat;
2195}
2196
2197static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
2198{
2199 pg_data_t *pgdat = (pg_data_t *)arg;
2200
2201 (*pos)++;
2202 return next_online_pgdat(pgdat);
2203}
2204
2205static void frag_stop(struct seq_file *m, void *arg)
2206{
2207}
2208
2209/*
2210 * This walks the free areas for each zone.
2211 */
2212static int frag_show(struct seq_file *m, void *arg)
2213{
2214 pg_data_t *pgdat = (pg_data_t *)arg;
2215 struct zone *zone;
2216 struct zone *node_zones = pgdat->node_zones;
2217 unsigned long flags;
2218 int order;
2219
2220 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2221 if (!populated_zone(zone))
2222 continue;
2223
2224 spin_lock_irqsave(&zone->lock, flags);
2225 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
2226 for (order = 0; order < MAX_ORDER; ++order)
2227 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
2228 spin_unlock_irqrestore(&zone->lock, flags);
2229 seq_putc(m, '\n');
2230 }
2231 return 0;
2232}
2233
2234struct seq_operations fragmentation_op = {
2235 .start = frag_start,
2236 .next = frag_next,
2237 .stop = frag_stop,
2238 .show = frag_show,
2239};
2240
2241/*
2242 * Output information about zones in @pgdat.
2243 */
2244static int zoneinfo_show(struct seq_file *m, void *arg)
2245{
2246 pg_data_t *pgdat = arg;
2247 struct zone *zone;
2248 struct zone *node_zones = pgdat->node_zones;
2249 unsigned long flags;
2250
2251 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2252 int i;
2253
2254 if (!populated_zone(zone))
2255 continue;
2256
2257 spin_lock_irqsave(&zone->lock, flags);
2258 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
2259 seq_printf(m,
2260 "\n pages free %lu"
2261 "\n min %lu"
2262 "\n low %lu"
2263 "\n high %lu"
2264 "\n active %lu"
2265 "\n inactive %lu"
2266 "\n scanned %lu (a: %lu i: %lu)"
2267 "\n spanned %lu"
2268 "\n present %lu",
2269 zone->free_pages,
2270 zone->pages_min,
2271 zone->pages_low,
2272 zone->pages_high,
2273 zone->nr_active,
2274 zone->nr_inactive,
2275 zone->pages_scanned,
2276 zone->nr_scan_active, zone->nr_scan_inactive,
2277 zone->spanned_pages,
2278 zone->present_pages);
2279 seq_printf(m,
2280 "\n protection: (%lu",
2281 zone->lowmem_reserve[0]);
2282 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
2283 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
2284 seq_printf(m,
2285 ")"
2286 "\n pagesets");
2287 for_each_online_cpu(i) {
2288 struct per_cpu_pageset *pageset;
2289 int j;
2290
2291 pageset = zone_pcp(zone, i);
2292 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2293 if (pageset->pcp[j].count)
2294 break;
2295 }
2296 if (j == ARRAY_SIZE(pageset->pcp))
2297 continue;
2298 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2299 seq_printf(m,
2300 "\n cpu: %i pcp: %i"
2301 "\n count: %i"
2302 "\n high: %i"
2303 "\n batch: %i",
2304 i, j,
2305 pageset->pcp[j].count,
2306 pageset->pcp[j].high,
2307 pageset->pcp[j].batch);
2308 }
2309#ifdef CONFIG_NUMA
2310 seq_printf(m,
2311 "\n numa_hit: %lu"
2312 "\n numa_miss: %lu"
2313 "\n numa_foreign: %lu"
2314 "\n interleave_hit: %lu"
2315 "\n local_node: %lu"
2316 "\n other_node: %lu",
2317 pageset->numa_hit,
2318 pageset->numa_miss,
2319 pageset->numa_foreign,
2320 pageset->interleave_hit,
2321 pageset->local_node,
2322 pageset->other_node);
2323#endif
2324 }
2325 seq_printf(m,
2326 "\n all_unreclaimable: %u"
2327 "\n prev_priority: %i"
2328 "\n temp_priority: %i"
2329 "\n start_pfn: %lu",
2330 zone->all_unreclaimable,
2331 zone->prev_priority,
2332 zone->temp_priority,
2333 zone->zone_start_pfn);
2334 spin_unlock_irqrestore(&zone->lock, flags);
2335 seq_putc(m, '\n');
2336 }
2337 return 0;
2338}
2339
2340struct seq_operations zoneinfo_op = {
2341 .start = frag_start, /* iterate over all zones. The same as in
2342 * fragmentation. */
2343 .next = frag_next,
2344 .stop = frag_stop,
2345 .show = zoneinfo_show,
2346};
2347
2348static char *vmstat_text[] = {
2349 "nr_dirty",
2350 "nr_writeback",
2351 "nr_unstable",
2352 "nr_page_table_pages",
2353 "nr_mapped",
2354 "nr_slab",
2355
2356 "pgpgin",
2357 "pgpgout",
2358 "pswpin",
2359 "pswpout",
2360
2361 "pgalloc_high",
2362 "pgalloc_normal",
2363 "pgalloc_dma32",
2364 "pgalloc_dma",
2365
2366 "pgfree",
2367 "pgactivate",
2368 "pgdeactivate",
2369
2370 "pgfault",
2371 "pgmajfault",
2372
2373 "pgrefill_high",
2374 "pgrefill_normal",
2375 "pgrefill_dma32",
2376 "pgrefill_dma",
2377
2378 "pgsteal_high",
2379 "pgsteal_normal",
2380 "pgsteal_dma32",
2381 "pgsteal_dma",
2382
2383 "pgscan_kswapd_high",
2384 "pgscan_kswapd_normal",
2385 "pgscan_kswapd_dma32",
2386 "pgscan_kswapd_dma",
2387
2388 "pgscan_direct_high",
2389 "pgscan_direct_normal",
2390 "pgscan_direct_dma32",
2391 "pgscan_direct_dma",
2392
2393 "pginodesteal",
2394 "slabs_scanned",
2395 "kswapd_steal",
2396 "kswapd_inodesteal",
2397 "pageoutrun",
2398 "allocstall",
2399
2400 "pgrotated",
2401 "nr_bounce",
2402};
2403
2404static void *vmstat_start(struct seq_file *m, loff_t *pos)
2405{
2406 struct page_state *ps;
2407
2408 if (*pos >= ARRAY_SIZE(vmstat_text))
2409 return NULL;
2410
2411 ps = kmalloc(sizeof(*ps), GFP_KERNEL);
2412 m->private = ps;
2413 if (!ps)
2414 return ERR_PTR(-ENOMEM);
2415 get_full_page_state(ps);
2416 ps->pgpgin /= 2; /* sectors -> kbytes */
2417 ps->pgpgout /= 2;
2418 return (unsigned long *)ps + *pos;
2419}
2420
2421static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
2422{
2423 (*pos)++;
2424 if (*pos >= ARRAY_SIZE(vmstat_text))
2425 return NULL;
2426 return (unsigned long *)m->private + *pos;
2427}
2428
2429static int vmstat_show(struct seq_file *m, void *arg)
2430{
2431 unsigned long *l = arg;
2432 unsigned long off = l - (unsigned long *)m->private;
2433
2434 seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
2435 return 0;
2436}
2437
2438static void vmstat_stop(struct seq_file *m, void *arg)
2439{
2440 kfree(m->private);
2441 m->private = NULL;
2442}
2443
2444struct seq_operations vmstat_op = {
2445 .start = vmstat_start,
2446 .next = vmstat_next,
2447 .stop = vmstat_stop,
2448 .show = vmstat_show,
2449};
2450
2451#endif /* CONFIG_PROC_FS */
2452
2453#ifdef CONFIG_HOTPLUG_CPU 2098#ifdef CONFIG_HOTPLUG_CPU
2454static int page_alloc_cpu_notify(struct notifier_block *self, 2099static int page_alloc_cpu_notify(struct notifier_block *self,
2455 unsigned long action, void *hcpu) 2100 unsigned long action, void *hcpu)
2456{ 2101{
2457 int cpu = (unsigned long)hcpu; 2102 int cpu = (unsigned long)hcpu;
2458 long *count;
2459 unsigned long *src, *dest;
2460 2103
2461 if (action == CPU_DEAD) { 2104 if (action == CPU_DEAD) {
2462 int i;
2463
2464 /* Drain local pagecache count. */
2465 count = &per_cpu(nr_pagecache_local, cpu);
2466 atomic_add(*count, &nr_pagecache);
2467 *count = 0;
2468 local_irq_disable(); 2105 local_irq_disable();
2469 __drain_pages(cpu); 2106 __drain_pages(cpu);
2470 2107 vm_events_fold_cpu(cpu);
2471 /* Add dead cpu's page_states to our own. */
2472 dest = (unsigned long *)&__get_cpu_var(page_states);
2473 src = (unsigned long *)&per_cpu(page_states, cpu);
2474
2475 for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
2476 i++) {
2477 dest[i] += src[i];
2478 src[i] = 0;
2479 }
2480
2481 local_irq_enable(); 2108 local_irq_enable();
2109 refresh_cpu_vm_stats(cpu);
2482 } 2110 }
2483 return NOTIFY_OK; 2111 return NOTIFY_OK;
2484} 2112}
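
The per-CPU struct page_state accumulation and the /proc seq_file code deleted in this hunk are superseded by the consolidated zone and event counters used throughout this patch (global_page_state(), count_vm_event(), refresh_cpu_vm_stats()); the /proc/vmstat interface itself remains, with the same counter names. A small userspace reader as a usage example:

/* Print a few of the counters that the removed vmstat_text[] listed. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f) {
                perror("/proc/vmstat");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                if (!strncmp(line, "nr_dirty ", 9) ||
                    !strncmp(line, "nr_writeback ", 13) ||
                    !strncmp(line, "pgfault ", 8))
                        fputs(line, stdout);
        fclose(f);
        return 0;
}
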
@@ -2804,42 +2432,14 @@ void *__init alloc_large_system_hash(const char *tablename,
2804} 2432}
2805 2433
2806#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 2434#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
2807/*
2808 * pfn <-> page translation. out-of-line version.
2809 * (see asm-generic/memory_model.h)
2810 */
2811#if defined(CONFIG_FLATMEM)
2812struct page *pfn_to_page(unsigned long pfn)
2813{
2814 return mem_map + (pfn - ARCH_PFN_OFFSET);
2815}
2816unsigned long page_to_pfn(struct page *page)
2817{
2818 return (page - mem_map) + ARCH_PFN_OFFSET;
2819}
2820#elif defined(CONFIG_DISCONTIGMEM)
2821struct page *pfn_to_page(unsigned long pfn)
2822{
2823 int nid = arch_pfn_to_nid(pfn);
2824 return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
2825}
2826unsigned long page_to_pfn(struct page *page)
2827{
2828 struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
2829 return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
2830}
2831#elif defined(CONFIG_SPARSEMEM)
2832struct page *pfn_to_page(unsigned long pfn) 2435struct page *pfn_to_page(unsigned long pfn)
2833{ 2436{
2834 return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; 2437 return __pfn_to_page(pfn);
2835} 2438}
2836
2837unsigned long page_to_pfn(struct page *page) 2439unsigned long page_to_pfn(struct page *page)
2838{ 2440{
2839 long section_id = page_to_section(page); 2441 return __page_to_pfn(page);
2840 return page - __section_mem_map_addr(__nr_to_section(section_id));
2841} 2442}
2842#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */
2843EXPORT_SYMBOL(pfn_to_page); 2443EXPORT_SYMBOL(pfn_to_page);
2844EXPORT_SYMBOL(page_to_pfn); 2444EXPORT_SYMBOL(page_to_pfn);
2845#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 2445#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
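
The three hand-rolled out-of-line conversions are collapsed into the generic __pfn_to_page()/__page_to_pfn() helpers from asm-generic/memory_model.h; in the FLATMEM case they reduce to the same pointer arithmetic the deleted code spelled out. A toy illustration of that arithmetic, with a tiny fake mem_map[] and a made-up ARCH_PFN_OFFSET:

/* FLATMEM-style pfn <-> page conversion over a flat array of page structs. */
#include <stdio.h>

struct page { int flags; };

#define ARCH_PFN_OFFSET 0x100UL
static struct page mem_map[16];

static struct page *toy_pfn_to_page(unsigned long pfn)
{
        return mem_map + (pfn - ARCH_PFN_OFFSET);
}

static unsigned long toy_page_to_pfn(struct page *page)
{
        return (unsigned long)(page - mem_map) + ARCH_PFN_OFFSET;
}

int main(void)
{
        struct page *p = toy_pfn_to_page(0x105);

        printf("pfn 0x105 -> mem_map[%ld] -> pfn %#lx\n",
               (long)(p - mem_map), toy_page_to_pfn(p));
        return 0;
}
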
diff --git a/mm/page_io.c b/mm/page_io.c
index bb2b0d53889c..88029948d00a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -101,7 +101,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
101 } 101 }
102 if (wbc->sync_mode == WB_SYNC_ALL) 102 if (wbc->sync_mode == WB_SYNC_ALL)
103 rw |= (1 << BIO_RW_SYNC); 103 rw |= (1 << BIO_RW_SYNC);
104 inc_page_state(pswpout); 104 count_vm_event(PSWPOUT);
105 set_page_writeback(page); 105 set_page_writeback(page);
106 unlock_page(page); 106 unlock_page(page);
107 submit_bio(rw, bio); 107 submit_bio(rw, bio);
@@ -123,7 +123,7 @@ int swap_readpage(struct file *file, struct page *page)
123 ret = -ENOMEM; 123 ret = -ENOMEM;
124 goto out; 124 goto out;
125 } 125 }
126 inc_page_state(pswpin); 126 count_vm_event(PSWPIN);
127 submit_bio(READ, bio); 127 submit_bio(READ, bio);
128out: 128out:
129 return ret; 129 return ret;
diff --git a/mm/pdflush.c b/mm/pdflush.c
index c4b6d0afd736..b02102feeb4b 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -104,21 +104,20 @@ static int __pdflush(struct pdflush_work *my_work)
104 list_move(&my_work->list, &pdflush_list); 104 list_move(&my_work->list, &pdflush_list);
105 my_work->when_i_went_to_sleep = jiffies; 105 my_work->when_i_went_to_sleep = jiffies;
106 spin_unlock_irq(&pdflush_lock); 106 spin_unlock_irq(&pdflush_lock);
107
108 schedule(); 107 schedule();
109 if (try_to_freeze()) { 108 try_to_freeze();
110 spin_lock_irq(&pdflush_lock);
111 continue;
112 }
113
114 spin_lock_irq(&pdflush_lock); 109 spin_lock_irq(&pdflush_lock);
115 if (!list_empty(&my_work->list)) { 110 if (!list_empty(&my_work->list)) {
116 printk("pdflush: bogus wakeup!\n"); 111 /*
112 * Someone woke us up, but without removing our control
113 * structure from the global list. swsusp will do this
114 * in try_to_freeze()->refrigerator(). Handle it.
115 */
117 my_work->fn = NULL; 116 my_work->fn = NULL;
118 continue; 117 continue;
119 } 118 }
120 if (my_work->fn == NULL) { 119 if (my_work->fn == NULL) {
121 printk("pdflush: NULL work function\n"); 120 printk("pdflush: bogus wakeup\n");
122 continue; 121 continue;
123 } 122 }
124 spin_unlock_irq(&pdflush_lock); 123 spin_unlock_irq(&pdflush_lock);
@@ -202,8 +201,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
202 unsigned long flags; 201 unsigned long flags;
203 int ret = 0; 202 int ret = 0;
204 203
205 if (fn == NULL) 204 BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
206 BUG(); /* Hard to diagnose if it's deferred */
207 205
208 spin_lock_irqsave(&pdflush_lock, flags); 206 spin_lock_irqsave(&pdflush_lock, flags);
209 if (list_empty(&pdflush_list)) { 207 if (list_empty(&pdflush_list)) {
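
Editor's note: the reworked pdflush loop above treats a wakeup as a hint rather than a promise. After returning from schedule()/try_to_freeze() the thread re-takes pdflush_lock and checks whether its control structure is still on pdflush_list and whether a work function was really assigned. The same "recheck the predicate after every wakeup" pattern looks like this in a plain pthreads sketch; the names and flags are invented for the example, and it should be compiled with -pthread.

    /* Sketch: wakeups are hints; the woken thread must re-verify its state. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static int have_work;          /* protected by lock */
    static int shutting_down;      /* protected by lock */

    static void *worker(void *arg)
    {
        pthread_mutex_lock(&lock);
        for (;;) {
            /* A wakeup is only a hint: recheck the predicate every time. */
            while (!have_work && !shutting_down)
                pthread_cond_wait(&cond, &lock);
            if (shutting_down)
                break;               /* may skip a queued item; fine for a sketch */
            have_work = 0;
            pthread_mutex_unlock(&lock);
            printf("worker: doing one unit of work\n");
            pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t tid;
        pthread_create(&tid, NULL, worker, NULL);

        pthread_mutex_lock(&lock);
        have_work = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);

        pthread_mutex_lock(&lock);
        shutting_down = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);

        pthread_join(tid, NULL);
        return 0;
    }

The point mirrored from the patch is that waking up proves nothing: the worker owns the lock when it wakes and re-derives what to do from the shared state, never from the wakeup itself.
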
diff --git a/mm/readahead.c b/mm/readahead.c
index ba7db816f4c8..1ba736ac0367 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -119,8 +119,7 @@ static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
119#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) 119#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
120 120
121/** 121/**
122 * read_cache_pages - populate an address space with some pages, and 122 * read_cache_pages - populate an address space with some pages & start reads against them
123 * start reads against them.
124 * @mapping: the address_space 123 * @mapping: the address_space
125 * @pages: The address of a list_head which contains the target pages. These 124 * @pages: The address of a list_head which contains the target pages. These
126 * pages have their ->index populated and are otherwise uninitialised. 125 * pages have their ->index populated and are otherwise uninitialised.
@@ -183,14 +182,11 @@ static int read_pages(struct address_space *mapping, struct file *filp,
183 list_del(&page->lru); 182 list_del(&page->lru);
184 if (!add_to_page_cache(page, mapping, 183 if (!add_to_page_cache(page, mapping,
185 page->index, GFP_KERNEL)) { 184 page->index, GFP_KERNEL)) {
186 ret = mapping->a_ops->readpage(filp, page); 185 mapping->a_ops->readpage(filp, page);
187 if (ret != AOP_TRUNCATED_PAGE) { 186 if (!pagevec_add(&lru_pvec, page))
188 if (!pagevec_add(&lru_pvec, page)) 187 __pagevec_lru_add(&lru_pvec);
189 __pagevec_lru_add(&lru_pvec); 188 } else
190 continue; 189 page_cache_release(page);
191 } /* else fall through to release */
192 }
193 page_cache_release(page);
194 } 190 }
195 pagevec_lru_add(&lru_pvec); 191 pagevec_lru_add(&lru_pvec);
196 ret = 0; 192 ret = 0;
@@ -395,8 +391,8 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
395 * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' 391 * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
396 * is set wait till the read completes. Otherwise attempt to read without 392 * is set wait till the read completes. Otherwise attempt to read without
397 * blocking. 393 * blocking.
398 * Returns 1 meaning 'success' if read is succesfull without switching off 394 * Returns 1 meaning 'success' if read is successful without switching off
399 * readhaead mode. Otherwise return failure. 395 * readahead mode. Otherwise return failure.
400 */ 396 */
401static int 397static int
402blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, 398blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
diff --git a/mm/rmap.c b/mm/rmap.c
index 1963e269314d..40158b59729e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
103 spin_lock(&mm->page_table_lock); 103 spin_lock(&mm->page_table_lock);
104 if (likely(!vma->anon_vma)) { 104 if (likely(!vma->anon_vma)) {
105 vma->anon_vma = anon_vma; 105 vma->anon_vma = anon_vma;
106 list_add(&vma->anon_vma_node, &anon_vma->head); 106 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
107 allocated = NULL; 107 allocated = NULL;
108 } 108 }
109 spin_unlock(&mm->page_table_lock); 109 spin_unlock(&mm->page_table_lock);
@@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma)
127 struct anon_vma *anon_vma = vma->anon_vma; 127 struct anon_vma *anon_vma = vma->anon_vma;
128 128
129 if (anon_vma) { 129 if (anon_vma) {
130 list_add(&vma->anon_vma_node, &anon_vma->head); 130 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
131 validate_anon_vma(vma); 131 validate_anon_vma(vma);
132 } 132 }
133} 133}
@@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma)
138 138
139 if (anon_vma) { 139 if (anon_vma) {
140 spin_lock(&anon_vma->lock); 140 spin_lock(&anon_vma->lock);
141 list_add(&vma->anon_vma_node, &anon_vma->head); 141 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
142 validate_anon_vma(vma); 142 validate_anon_vma(vma);
143 spin_unlock(&anon_vma->lock); 143 spin_unlock(&anon_vma->lock);
144 } 144 }
@@ -205,44 +205,6 @@ out:
205 return anon_vma; 205 return anon_vma;
206} 206}
207 207
208#ifdef CONFIG_MIGRATION
209/*
210 * Remove an anonymous page from swap replacing the swap pte's
211 * through real pte's pointing to valid pages and then releasing
212 * the page from the swap cache.
213 *
214 * Must hold page lock on page and mmap_sem of one vma that contains
215 * the page.
216 */
217void remove_from_swap(struct page *page)
218{
219 struct anon_vma *anon_vma;
220 struct vm_area_struct *vma;
221 unsigned long mapping;
222
223 if (!PageSwapCache(page))
224 return;
225
226 mapping = (unsigned long)page->mapping;
227
228 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
229 return;
230
231 /*
232 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
233 */
234 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
235 spin_lock(&anon_vma->lock);
236
237 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
238 remove_vma_swap(vma, page);
239
240 spin_unlock(&anon_vma->lock);
241 delete_from_swap_cache(page);
242}
243EXPORT_SYMBOL(remove_from_swap);
244#endif
245
246/* 208/*
247 * At what user virtual address is page expected in vma? 209 * At what user virtual address is page expected in vma?
248 */ 210 */
@@ -493,7 +455,7 @@ static void __page_set_anon_rmap(struct page *page,
493 * nr_mapped state can be updated without turning off 455 * nr_mapped state can be updated without turning off
494 * interrupts because it is not modified via interrupt. 456 * interrupts because it is not modified via interrupt.
495 */ 457 */
496 __inc_page_state(nr_mapped); 458 __inc_zone_page_state(page, NR_ANON_PAGES);
497} 459}
498 460
499/** 461/**
@@ -537,7 +499,7 @@ void page_add_new_anon_rmap(struct page *page,
537void page_add_file_rmap(struct page *page) 499void page_add_file_rmap(struct page *page)
538{ 500{
539 if (atomic_inc_and_test(&page->_mapcount)) 501 if (atomic_inc_and_test(&page->_mapcount))
540 __inc_page_state(nr_mapped); 502 __inc_zone_page_state(page, NR_FILE_MAPPED);
541} 503}
542 504
543/** 505/**
@@ -569,7 +531,8 @@ void page_remove_rmap(struct page *page)
569 */ 531 */
570 if (page_test_and_clear_dirty(page)) 532 if (page_test_and_clear_dirty(page))
571 set_page_dirty(page); 533 set_page_dirty(page);
572 __dec_page_state(nr_mapped); 534 __dec_zone_page_state(page,
535 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
573 } 536 }
574} 537}
575 538
@@ -578,7 +541,7 @@ void page_remove_rmap(struct page *page)
578 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 541 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
579 */ 542 */
580static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 543static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
581 int ignore_refs) 544 int migration)
582{ 545{
583 struct mm_struct *mm = vma->vm_mm; 546 struct mm_struct *mm = vma->vm_mm;
584 unsigned long address; 547 unsigned long address;
@@ -600,9 +563,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
600 * If it's recently referenced (perhaps page_referenced 563 * If it's recently referenced (perhaps page_referenced
601 * skipped over this mm) then we should reactivate it. 564 * skipped over this mm) then we should reactivate it.
602 */ 565 */
603 if ((vma->vm_flags & VM_LOCKED) || 566 if (!migration && ((vma->vm_flags & VM_LOCKED) ||
604 (ptep_clear_flush_young(vma, address, pte) 567 (ptep_clear_flush_young(vma, address, pte)))) {
605 && !ignore_refs)) {
606 ret = SWAP_FAIL; 568 ret = SWAP_FAIL;
607 goto out_unmap; 569 goto out_unmap;
608 } 570 }
@@ -620,24 +582,45 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
620 582
621 if (PageAnon(page)) { 583 if (PageAnon(page)) {
622 swp_entry_t entry = { .val = page_private(page) }; 584 swp_entry_t entry = { .val = page_private(page) };
623 /* 585
624 * Store the swap location in the pte. 586 if (PageSwapCache(page)) {
625 * See handle_pte_fault() ... 587 /*
626 */ 588 * Store the swap location in the pte.
627 BUG_ON(!PageSwapCache(page)); 589 * See handle_pte_fault() ...
628 swap_duplicate(entry); 590 */
629 if (list_empty(&mm->mmlist)) { 591 swap_duplicate(entry);
630 spin_lock(&mmlist_lock); 592 if (list_empty(&mm->mmlist)) {
631 if (list_empty(&mm->mmlist)) 593 spin_lock(&mmlist_lock);
632 list_add(&mm->mmlist, &init_mm.mmlist); 594 if (list_empty(&mm->mmlist))
633 spin_unlock(&mmlist_lock); 595 list_add(&mm->mmlist, &init_mm.mmlist);
596 spin_unlock(&mmlist_lock);
597 }
598 dec_mm_counter(mm, anon_rss);
599#ifdef CONFIG_MIGRATION
600 } else {
601 /*
602 * Store the pfn of the page in a special migration
603 * pte. do_swap_page() will wait until the migration
604 * pte is removed and then restart fault handling.
605 */
606 BUG_ON(!migration);
607 entry = make_migration_entry(page, pte_write(pteval));
608#endif
634 } 609 }
635 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 610 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
636 BUG_ON(pte_file(*pte)); 611 BUG_ON(pte_file(*pte));
637 dec_mm_counter(mm, anon_rss);
638 } else 612 } else
613#ifdef CONFIG_MIGRATION
614 if (migration) {
615 /* Establish migration entry for a file page */
616 swp_entry_t entry;
617 entry = make_migration_entry(page, pte_write(pteval));
618 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
619 } else
620#endif
639 dec_mm_counter(mm, file_rss); 621 dec_mm_counter(mm, file_rss);
640 622
623
641 page_remove_rmap(page); 624 page_remove_rmap(page);
642 page_cache_release(page); 625 page_cache_release(page);
643 626
@@ -736,7 +719,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
736 pte_unmap_unlock(pte - 1, ptl); 719 pte_unmap_unlock(pte - 1, ptl);
737} 720}
738 721
739static int try_to_unmap_anon(struct page *page, int ignore_refs) 722static int try_to_unmap_anon(struct page *page, int migration)
740{ 723{
741 struct anon_vma *anon_vma; 724 struct anon_vma *anon_vma;
742 struct vm_area_struct *vma; 725 struct vm_area_struct *vma;
@@ -747,7 +730,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs)
747 return ret; 730 return ret;
748 731
749 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 732 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
750 ret = try_to_unmap_one(page, vma, ignore_refs); 733 ret = try_to_unmap_one(page, vma, migration);
751 if (ret == SWAP_FAIL || !page_mapped(page)) 734 if (ret == SWAP_FAIL || !page_mapped(page))
752 break; 735 break;
753 } 736 }
@@ -764,7 +747,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs)
764 * 747 *
765 * This function is only called from try_to_unmap for object-based pages. 748 * This function is only called from try_to_unmap for object-based pages.
766 */ 749 */
767static int try_to_unmap_file(struct page *page, int ignore_refs) 750static int try_to_unmap_file(struct page *page, int migration)
768{ 751{
769 struct address_space *mapping = page->mapping; 752 struct address_space *mapping = page->mapping;
770 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 753 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -778,7 +761,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs)
778 761
779 spin_lock(&mapping->i_mmap_lock); 762 spin_lock(&mapping->i_mmap_lock);
780 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 763 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
781 ret = try_to_unmap_one(page, vma, ignore_refs); 764 ret = try_to_unmap_one(page, vma, migration);
782 if (ret == SWAP_FAIL || !page_mapped(page)) 765 if (ret == SWAP_FAIL || !page_mapped(page))
783 goto out; 766 goto out;
784 } 767 }
@@ -788,7 +771,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs)
788 771
789 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 772 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
790 shared.vm_set.list) { 773 shared.vm_set.list) {
791 if (vma->vm_flags & VM_LOCKED) 774 if ((vma->vm_flags & VM_LOCKED) && !migration)
792 continue; 775 continue;
793 cursor = (unsigned long) vma->vm_private_data; 776 cursor = (unsigned long) vma->vm_private_data;
794 if (cursor > max_nl_cursor) 777 if (cursor > max_nl_cursor)
@@ -822,7 +805,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs)
822 do { 805 do {
823 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 806 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
824 shared.vm_set.list) { 807 shared.vm_set.list) {
825 if (vma->vm_flags & VM_LOCKED) 808 if ((vma->vm_flags & VM_LOCKED) && !migration)
826 continue; 809 continue;
827 cursor = (unsigned long) vma->vm_private_data; 810 cursor = (unsigned long) vma->vm_private_data;
828 while ( cursor < max_nl_cursor && 811 while ( cursor < max_nl_cursor &&
@@ -863,16 +846,16 @@ out:
863 * SWAP_AGAIN - we missed a mapping, try again later 846 * SWAP_AGAIN - we missed a mapping, try again later
864 * SWAP_FAIL - the page is unswappable 847 * SWAP_FAIL - the page is unswappable
865 */ 848 */
866int try_to_unmap(struct page *page, int ignore_refs) 849int try_to_unmap(struct page *page, int migration)
867{ 850{
868 int ret; 851 int ret;
869 852
870 BUG_ON(!PageLocked(page)); 853 BUG_ON(!PageLocked(page));
871 854
872 if (PageAnon(page)) 855 if (PageAnon(page))
873 ret = try_to_unmap_anon(page, ignore_refs); 856 ret = try_to_unmap_anon(page, migration);
874 else 857 else
875 ret = try_to_unmap_file(page, ignore_refs); 858 ret = try_to_unmap_file(page, migration);
876 859
877 if (!page_mapped(page)) 860 if (!page_mapped(page))
878 ret = SWAP_SUCCESS; 861 ret = SWAP_SUCCESS;
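
Editor's note: with the old ignore_refs flag replaced by migration, try_to_unmap_one() can now leave a migration entry in the pte instead of a real swap entry: a non-present pte that records which page is being moved and whether the mapping was writable, so do_swap_page() can wait for the migration to finish and then retry the fault. The snippet below only models the encoding half of that idea in userspace C; the bit layout, the TYPE_MIGRATION_* values and the helper names are assumptions for the sketch and do not match the kernel's arch-specific swp_entry_t format.

    /*
     * Sketch of a "migration entry": pack a pfn plus a read/write marker
     * into one word, much as a swap entry packs (type, offset).
     */
    #include <assert.h>
    #include <stdio.h>

    #define TYPE_BITS            5UL
    #define TYPE_MIGRATION_READ  0x1eUL   /* assumed reserved "types" */
    #define TYPE_MIGRATION_WRITE 0x1fUL

    typedef struct { unsigned long val; } swp_entry_t;

    static swp_entry_t make_migration_entry(unsigned long pfn, int write)
    {
        unsigned long type = write ? TYPE_MIGRATION_WRITE : TYPE_MIGRATION_READ;
        swp_entry_t e = { .val = (pfn << TYPE_BITS) | type };
        return e;
    }

    static int is_migration_entry(swp_entry_t e)
    {
        unsigned long type = e.val & ((1UL << TYPE_BITS) - 1);
        return type == TYPE_MIGRATION_READ || type == TYPE_MIGRATION_WRITE;
    }

    static unsigned long migration_entry_pfn(swp_entry_t e)
    {
        return e.val >> TYPE_BITS;
    }

    int main(void)
    {
        swp_entry_t e = make_migration_entry(0x1234, 1);

        assert(is_migration_entry(e));
        assert(migration_entry_pfn(e) == 0x1234);
        printf("entry 0x%lx -> pfn 0x%lx (writable mapping)\n",
               e.val, migration_entry_pfn(e));
        return 0;
    }
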
diff --git a/mm/shmem.c b/mm/shmem.c
index 1e43c8a865ba..db21c51531ca 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -23,10 +23,8 @@
23 * which makes it a completely usable filesystem. 23 * which makes it a completely usable filesystem.
24 */ 24 */
25 25
26#include <linux/config.h>
27#include <linux/module.h> 26#include <linux/module.h>
28#include <linux/init.h> 27#include <linux/init.h>
29#include <linux/devfs_fs_kernel.h>
30#include <linux/fs.h> 28#include <linux/fs.h>
31#include <linux/mm.h> 29#include <linux/mm.h>
32#include <linux/mman.h> 30#include <linux/mman.h>
@@ -174,7 +172,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
174} 172}
175 173
176static struct super_operations shmem_ops; 174static struct super_operations shmem_ops;
177static struct address_space_operations shmem_aops; 175static const struct address_space_operations shmem_aops;
178static struct file_operations shmem_file_operations; 176static struct file_operations shmem_file_operations;
179static struct inode_operations shmem_inode_operations; 177static struct inode_operations shmem_inode_operations;
180static struct inode_operations shmem_dir_inode_operations; 178static struct inode_operations shmem_dir_inode_operations;
@@ -1046,12 +1044,12 @@ repeat:
1046 swappage = lookup_swap_cache(swap); 1044 swappage = lookup_swap_cache(swap);
1047 if (!swappage) { 1045 if (!swappage) {
1048 shmem_swp_unmap(entry); 1046 shmem_swp_unmap(entry);
1049 spin_unlock(&info->lock);
1050 /* here we actually do the io */ 1047 /* here we actually do the io */
1051 if (type && *type == VM_FAULT_MINOR) { 1048 if (type && *type == VM_FAULT_MINOR) {
1052 inc_page_state(pgmajfault); 1049 __count_vm_event(PGMAJFAULT);
1053 *type = VM_FAULT_MAJOR; 1050 *type = VM_FAULT_MAJOR;
1054 } 1051 }
1052 spin_unlock(&info->lock);
1055 swappage = shmem_swapin(info, swap, idx); 1053 swappage = shmem_swapin(info, swap, idx);
1056 if (!swappage) { 1054 if (!swappage) {
1057 spin_lock(&info->lock); 1055 spin_lock(&info->lock);
@@ -1081,14 +1079,6 @@ repeat:
1081 page_cache_release(swappage); 1079 page_cache_release(swappage);
1082 goto repeat; 1080 goto repeat;
1083 } 1081 }
1084 if (!PageSwapCache(swappage)) {
1085 /* Page migration has occured */
1086 shmem_swp_unmap(entry);
1087 spin_unlock(&info->lock);
1088 unlock_page(swappage);
1089 page_cache_release(swappage);
1090 goto repeat;
1091 }
1092 if (PageWriteback(swappage)) { 1082 if (PageWriteback(swappage)) {
1093 shmem_swp_unmap(entry); 1083 shmem_swp_unmap(entry);
1094 spin_unlock(&info->lock); 1084 spin_unlock(&info->lock);
@@ -1654,9 +1644,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
1654 return desc.error; 1644 return desc.error;
1655} 1645}
1656 1646
1657static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) 1647static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1658{ 1648{
1659 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1649 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1660 1650
1661 buf->f_type = TMPFS_MAGIC; 1651 buf->f_type = TMPFS_MAGIC;
1662 buf->f_bsize = PAGE_CACHE_SIZE; 1652 buf->f_bsize = PAGE_CACHE_SIZE;
@@ -2170,7 +2160,7 @@ static void destroy_inodecache(void)
2170 printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); 2160 printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
2171} 2161}
2172 2162
2173static struct address_space_operations shmem_aops = { 2163static const struct address_space_operations shmem_aops = {
2174 .writepage = shmem_writepage, 2164 .writepage = shmem_writepage,
2175 .set_page_dirty = __set_page_dirty_nobuffers, 2165 .set_page_dirty = __set_page_dirty_nobuffers,
2176#ifdef CONFIG_TMPFS 2166#ifdef CONFIG_TMPFS
@@ -2233,10 +2223,10 @@ static struct vm_operations_struct shmem_vm_ops = {
2233}; 2223};
2234 2224
2235 2225
2236static struct super_block *shmem_get_sb(struct file_system_type *fs_type, 2226static int shmem_get_sb(struct file_system_type *fs_type,
2237 int flags, const char *dev_name, void *data) 2227 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2238{ 2228{
2239 return get_sb_nodev(fs_type, flags, data, shmem_fill_super); 2229 return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
2240} 2230}
2241 2231
2242static struct file_system_type tmpfs_fs_type = { 2232static struct file_system_type tmpfs_fs_type = {
@@ -2260,10 +2250,8 @@ static int __init init_tmpfs(void)
2260 printk(KERN_ERR "Could not register tmpfs\n"); 2250 printk(KERN_ERR "Could not register tmpfs\n");
2261 goto out2; 2251 goto out2;
2262 } 2252 }
2263#ifdef CONFIG_TMPFS 2253
2264 devfs_mk_dir("shm"); 2254 shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
2265#endif
2266 shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER,
2267 tmpfs_fs_type.name, NULL); 2255 tmpfs_fs_type.name, NULL);
2268 if (IS_ERR(shm_mnt)) { 2256 if (IS_ERR(shm_mnt)) {
2269 error = PTR_ERR(shm_mnt); 2257 error = PTR_ERR(shm_mnt);
diff --git a/mm/slab.c b/mm/slab.c
index f1b644eb39d8..3936af344542 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -89,6 +89,7 @@
89#include <linux/config.h> 89#include <linux/config.h>
90#include <linux/slab.h> 90#include <linux/slab.h>
91#include <linux/mm.h> 91#include <linux/mm.h>
92#include <linux/poison.h>
92#include <linux/swap.h> 93#include <linux/swap.h>
93#include <linux/cache.h> 94#include <linux/cache.h>
94#include <linux/interrupt.h> 95#include <linux/interrupt.h>
@@ -106,6 +107,7 @@
106#include <linux/nodemask.h> 107#include <linux/nodemask.h>
107#include <linux/mempolicy.h> 108#include <linux/mempolicy.h>
108#include <linux/mutex.h> 109#include <linux/mutex.h>
110#include <linux/rtmutex.h>
109 111
110#include <asm/uaccess.h> 112#include <asm/uaccess.h>
111#include <asm/cacheflush.h> 113#include <asm/cacheflush.h>
@@ -307,6 +309,13 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
307#define SIZE_AC 1 309#define SIZE_AC 1
308#define SIZE_L3 (1 + MAX_NUMNODES) 310#define SIZE_L3 (1 + MAX_NUMNODES)
309 311
312static int drain_freelist(struct kmem_cache *cache,
313 struct kmem_list3 *l3, int tofree);
314static void free_block(struct kmem_cache *cachep, void **objpp, int len,
315 int node);
316static void enable_cpucache(struct kmem_cache *cachep);
317static void cache_reap(void *unused);
318
310/* 319/*
311 * This function must be completely optimized away if a constant is passed to 320 * This function must be completely optimized away if a constant is passed to
312 * it. Mostly the same as what is in linux/slab.h except it returns an index. 321 * it. Mostly the same as what is in linux/slab.h except it returns an index.
@@ -331,6 +340,8 @@ static __always_inline int index_of(const size_t size)
331 return 0; 340 return 0;
332} 341}
333 342
343static int slab_early_init = 1;
344
334#define INDEX_AC index_of(sizeof(struct arraycache_init)) 345#define INDEX_AC index_of(sizeof(struct arraycache_init))
335#define INDEX_L3 index_of(sizeof(struct kmem_list3)) 346#define INDEX_L3 index_of(sizeof(struct kmem_list3))
336 347
@@ -452,7 +463,7 @@ struct kmem_cache {
452#define STATS_DEC_ACTIVE(x) ((x)->num_active--) 463#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
453#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 464#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
454#define STATS_INC_GROWN(x) ((x)->grown++) 465#define STATS_INC_GROWN(x) ((x)->grown++)
455#define STATS_INC_REAPED(x) ((x)->reaped++) 466#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
456#define STATS_SET_HIGH(x) \ 467#define STATS_SET_HIGH(x) \
457 do { \ 468 do { \
458 if ((x)->num_active > (x)->high_mark) \ 469 if ((x)->num_active > (x)->high_mark) \
@@ -476,7 +487,7 @@ struct kmem_cache {
476#define STATS_DEC_ACTIVE(x) do { } while (0) 487#define STATS_DEC_ACTIVE(x) do { } while (0)
477#define STATS_INC_ALLOCED(x) do { } while (0) 488#define STATS_INC_ALLOCED(x) do { } while (0)
478#define STATS_INC_GROWN(x) do { } while (0) 489#define STATS_INC_GROWN(x) do { } while (0)
479#define STATS_INC_REAPED(x) do { } while (0) 490#define STATS_ADD_REAPED(x,y) do { } while (0)
480#define STATS_SET_HIGH(x) do { } while (0) 491#define STATS_SET_HIGH(x) do { } while (0)
481#define STATS_INC_ERR(x) do { } while (0) 492#define STATS_INC_ERR(x) do { } while (0)
482#define STATS_INC_NODEALLOCS(x) do { } while (0) 493#define STATS_INC_NODEALLOCS(x) do { } while (0)
@@ -490,17 +501,6 @@ struct kmem_cache {
490#endif 501#endif
491 502
492#if DEBUG 503#if DEBUG
493/*
494 * Magic nums for obj red zoning.
495 * Placed in the first word before and the first word after an obj.
496 */
497#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */
498#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */
499
500/* ...and for poisoning */
501#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */
502#define POISON_FREE 0x6b /* for use-after-free poisoning */
503#define POISON_END 0xa5 /* end-byte of poisoning */
504 504
505/* 505/*
506 * memory layout of objects: 506 * memory layout of objects:
@@ -592,6 +592,7 @@ static inline struct kmem_cache *page_get_cache(struct page *page)
592{ 592{
593 if (unlikely(PageCompound(page))) 593 if (unlikely(PageCompound(page)))
594 page = (struct page *)page_private(page); 594 page = (struct page *)page_private(page);
595 BUG_ON(!PageSlab(page));
595 return (struct kmem_cache *)page->lru.next; 596 return (struct kmem_cache *)page->lru.next;
596} 597}
597 598
@@ -604,6 +605,7 @@ static inline struct slab *page_get_slab(struct page *page)
604{ 605{
605 if (unlikely(PageCompound(page))) 606 if (unlikely(PageCompound(page)))
606 page = (struct page *)page_private(page); 607 page = (struct page *)page_private(page);
608 BUG_ON(!PageSlab(page));
607 return (struct slab *)page->lru.prev; 609 return (struct slab *)page->lru.prev;
608} 610}
609 611
@@ -705,12 +707,6 @@ int slab_is_available(void)
705 707
706static DEFINE_PER_CPU(struct work_struct, reap_work); 708static DEFINE_PER_CPU(struct work_struct, reap_work);
707 709
708static void free_block(struct kmem_cache *cachep, void **objpp, int len,
709 int node);
710static void enable_cpucache(struct kmem_cache *cachep);
711static void cache_reap(void *unused);
712static int __node_shrink(struct kmem_cache *cachep, int node);
713
714static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 710static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
715{ 711{
716 return cachep->array[smp_processor_id()]; 712 return cachep->array[smp_processor_id()];
@@ -1024,6 +1020,40 @@ static void drain_alien_cache(struct kmem_cache *cachep,
1024 } 1020 }
1025 } 1021 }
1026} 1022}
1023
1024static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1025{
1026 struct slab *slabp = virt_to_slab(objp);
1027 int nodeid = slabp->nodeid;
1028 struct kmem_list3 *l3;
1029 struct array_cache *alien = NULL;
1030
1031 /*
1032 * Make sure we are not freeing a object from another node to the array
1033 * cache on this cpu.
1034 */
1035 if (likely(slabp->nodeid == numa_node_id()))
1036 return 0;
1037
1038 l3 = cachep->nodelists[numa_node_id()];
1039 STATS_INC_NODEFREES(cachep);
1040 if (l3->alien && l3->alien[nodeid]) {
1041 alien = l3->alien[nodeid];
1042 spin_lock(&alien->lock);
1043 if (unlikely(alien->avail == alien->limit)) {
1044 STATS_INC_ACOVERFLOW(cachep);
1045 __drain_alien_cache(cachep, alien, nodeid);
1046 }
1047 alien->entry[alien->avail++] = objp;
1048 spin_unlock(&alien->lock);
1049 } else {
1050 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1051 free_block(cachep, &objp, 1, nodeid);
1052 spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1053 }
1054 return 1;
1055}
1056
1027#else 1057#else
1028 1058
1029#define drain_alien_cache(cachep, alien) do { } while (0) 1059#define drain_alien_cache(cachep, alien) do { } while (0)
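
Editor's note: cache_free_alien(), factored out above, keeps cross-node frees away from the local per-CPU array. An object that belongs to another NUMA node is parked in that node's small "alien" array and only handed back (under the remote node's list lock) when the array fills, or freed to the remote list directly if no alien cache exists. The following is a deliberately tiny userspace caricature of the batching idea; the node count, batch size and remote_free() stub are invented, and all locking is omitted.

    /* Toy model of an "alien cache": batch frees destined for another node. */
    #include <stdio.h>

    #define NR_NODES    2
    #define ALIEN_LIMIT 4    /* assumed batch size */

    struct alien_cache {
        int avail;
        void *entry[ALIEN_LIMIT];
    };

    static struct alien_cache alien[NR_NODES];   /* one per remote node */

    /* Stand-in for handing objects back to their home node's free list. */
    static void remote_free(int node, void **objs, int nr)
    {
        (void)objs;   /* a real implementation would walk these */
        printf("flushing %d object(s) back to node %d\n", nr, node);
    }

    static void cache_free(int this_node, int obj_node, void *obj)
    {
        struct alien_cache *ac;

        if (obj_node == this_node) {
            printf("local free on node %d\n", this_node);
            return;
        }
        /* Remote object: batch it, flushing first if the batch is full. */
        ac = &alien[obj_node];
        if (ac->avail == ALIEN_LIMIT) {
            remote_free(obj_node, ac->entry, ac->avail);
            ac->avail = 0;
        }
        ac->entry[ac->avail++] = obj;
    }

    int main(void)
    {
        int dummy[6];
        for (int i = 0; i < 6; i++)
            cache_free(0, 1, &dummy[i]);   /* node 0 frees node 1 objects */
        cache_free(0, 0, &dummy[0]);
        return 0;
    }
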
@@ -1038,9 +1068,14 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
1038{ 1068{
1039} 1069}
1040 1070
1071static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1072{
1073 return 0;
1074}
1075
1041#endif 1076#endif
1042 1077
1043static int cpuup_callback(struct notifier_block *nfb, 1078static int __devinit cpuup_callback(struct notifier_block *nfb,
1044 unsigned long action, void *hcpu) 1079 unsigned long action, void *hcpu)
1045{ 1080{
1046 long cpu = (long)hcpu; 1081 long cpu = (long)hcpu;
@@ -1207,10 +1242,7 @@ free_array_cache:
1207 l3 = cachep->nodelists[node]; 1242 l3 = cachep->nodelists[node];
1208 if (!l3) 1243 if (!l3)
1209 continue; 1244 continue;
1210 spin_lock_irq(&l3->list_lock); 1245 drain_freelist(cachep, l3, l3->free_objects);
1211 /* free slabs belonging to this node */
1212 __node_shrink(cachep, node);
1213 spin_unlock_irq(&l3->list_lock);
1214 } 1246 }
1215 mutex_unlock(&cache_chain_mutex); 1247 mutex_unlock(&cache_chain_mutex);
1216 break; 1248 break;
@@ -1222,7 +1254,9 @@ bad:
1222 return NOTIFY_BAD; 1254 return NOTIFY_BAD;
1223} 1255}
1224 1256
1225static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; 1257static struct notifier_block __cpuinitdata cpucache_notifier = {
1258 &cpuup_callback, NULL, 0
1259};
1226 1260
1227/* 1261/*
1228 * swap the static kmem_list3 with kmalloced memory 1262 * swap the static kmem_list3 with kmalloced memory
@@ -1335,6 +1369,8 @@ void __init kmem_cache_init(void)
1335 NULL, NULL); 1369 NULL, NULL);
1336 } 1370 }
1337 1371
1372 slab_early_init = 0;
1373
1338 while (sizes->cs_size != ULONG_MAX) { 1374 while (sizes->cs_size != ULONG_MAX) {
1339 /* 1375 /*
1340 * For performance, all the general caches are L1 aligned. 1376 * For performance, all the general caches are L1 aligned.
@@ -1450,31 +1486,29 @@ __initcall(cpucache_init);
1450static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) 1486static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1451{ 1487{
1452 struct page *page; 1488 struct page *page;
1453 void *addr; 1489 int nr_pages;
1454 int i; 1490 int i;
1455 1491
1456 flags |= cachep->gfpflags;
1457#ifndef CONFIG_MMU 1492#ifndef CONFIG_MMU
1458 /* nommu uses slab's for process anonymous memory allocations, so 1493 /*
1459 * requires __GFP_COMP to properly refcount higher order allocations" 1494 * Nommu uses slab's for process anonymous memory allocations, and thus
1495 * requires __GFP_COMP to properly refcount higher order allocations
1460 */ 1496 */
1461 page = alloc_pages_node(nodeid, (flags | __GFP_COMP), cachep->gfporder); 1497 flags |= __GFP_COMP;
1462#else
1463 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1464#endif 1498#endif
1499 flags |= cachep->gfpflags;
1500
1501 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1465 if (!page) 1502 if (!page)
1466 return NULL; 1503 return NULL;
1467 addr = page_address(page);
1468 1504
1469 i = (1 << cachep->gfporder); 1505 nr_pages = (1 << cachep->gfporder);
1470 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1506 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1471 atomic_add(i, &slab_reclaim_pages); 1507 atomic_add(nr_pages, &slab_reclaim_pages);
1472 add_page_state(nr_slab, i); 1508 add_zone_page_state(page_zone(page), NR_SLAB, nr_pages);
1473 while (i--) { 1509 for (i = 0; i < nr_pages; i++)
1474 __SetPageSlab(page); 1510 __SetPageSlab(page + i);
1475 page++; 1511 return page_address(page);
1476 }
1477 return addr;
1478} 1512}
1479 1513
1480/* 1514/*
@@ -1486,12 +1520,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1486 struct page *page = virt_to_page(addr); 1520 struct page *page = virt_to_page(addr);
1487 const unsigned long nr_freed = i; 1521 const unsigned long nr_freed = i;
1488 1522
1523 sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed);
1489 while (i--) { 1524 while (i--) {
1490 BUG_ON(!PageSlab(page)); 1525 BUG_ON(!PageSlab(page));
1491 __ClearPageSlab(page); 1526 __ClearPageSlab(page);
1492 page++; 1527 page++;
1493 } 1528 }
1494 sub_page_state(nr_slab, nr_freed);
1495 if (current->reclaim_state) 1529 if (current->reclaim_state)
1496 current->reclaim_state->reclaimed_slab += nr_freed; 1530 current->reclaim_state->reclaimed_slab += nr_freed;
1497 free_pages((unsigned long)addr, cachep->gfporder); 1531 free_pages((unsigned long)addr, cachep->gfporder);
@@ -1913,8 +1947,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1913 void (*dtor)(void*, struct kmem_cache *, unsigned long)) 1947 void (*dtor)(void*, struct kmem_cache *, unsigned long))
1914{ 1948{
1915 size_t left_over, slab_size, ralign; 1949 size_t left_over, slab_size, ralign;
1916 struct kmem_cache *cachep = NULL; 1950 struct kmem_cache *cachep = NULL, *pc;
1917 struct list_head *p;
1918 1951
1919 /* 1952 /*
1920 * Sanity checks... these are all serious usage bugs. 1953 * Sanity checks... these are all serious usage bugs.
@@ -1934,8 +1967,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1934 1967
1935 mutex_lock(&cache_chain_mutex); 1968 mutex_lock(&cache_chain_mutex);
1936 1969
1937 list_for_each(p, &cache_chain) { 1970 list_for_each_entry(pc, &cache_chain, next) {
1938 struct kmem_cache *pc = list_entry(p, struct kmem_cache, next);
1939 mm_segment_t old_fs = get_fs(); 1971 mm_segment_t old_fs = get_fs();
1940 char tmp; 1972 char tmp;
1941 int res; 1973 int res;
@@ -2069,8 +2101,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2069#endif 2101#endif
2070#endif 2102#endif
2071 2103
2072 /* Determine if the slab management is 'on' or 'off' slab. */ 2104 /*
2073 if (size >= (PAGE_SIZE >> 3)) 2105 * Determine if the slab management is 'on' or 'off' slab.
2106 * (bootstrapping cannot cope with offslab caches so don't do
2107 * it too early on.)
2108 */
2109 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2074 /* 2110 /*
2075 * Size is large, assume best to place the slab management obj 2111 * Size is large, assume best to place the slab management obj
2076 * off-slab (should allow better packing of objs). 2112 * off-slab (should allow better packing of objs).
@@ -2210,32 +2246,45 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2210 } 2246 }
2211} 2247}
2212 2248
2213static int __node_shrink(struct kmem_cache *cachep, int node) 2249/*
2250 * Remove slabs from the list of free slabs.
2251 * Specify the number of slabs to drain in tofree.
2252 *
2253 * Returns the actual number of slabs released.
2254 */
2255static int drain_freelist(struct kmem_cache *cache,
2256 struct kmem_list3 *l3, int tofree)
2214{ 2257{
2258 struct list_head *p;
2259 int nr_freed;
2215 struct slab *slabp; 2260 struct slab *slabp;
2216 struct kmem_list3 *l3 = cachep->nodelists[node];
2217 int ret;
2218 2261
2219 for (;;) { 2262 nr_freed = 0;
2220 struct list_head *p; 2263 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2221 2264
2265 spin_lock_irq(&l3->list_lock);
2222 p = l3->slabs_free.prev; 2266 p = l3->slabs_free.prev;
2223 if (p == &l3->slabs_free) 2267 if (p == &l3->slabs_free) {
2224 break; 2268 spin_unlock_irq(&l3->list_lock);
2269 goto out;
2270 }
2225 2271
2226 slabp = list_entry(l3->slabs_free.prev, struct slab, list); 2272 slabp = list_entry(p, struct slab, list);
2227#if DEBUG 2273#if DEBUG
2228 BUG_ON(slabp->inuse); 2274 BUG_ON(slabp->inuse);
2229#endif 2275#endif
2230 list_del(&slabp->list); 2276 list_del(&slabp->list);
2231 2277 /*
2232 l3->free_objects -= cachep->num; 2278 * Safe to drop the lock. The slab is no longer linked
2279 * to the cache.
2280 */
2281 l3->free_objects -= cache->num;
2233 spin_unlock_irq(&l3->list_lock); 2282 spin_unlock_irq(&l3->list_lock);
2234 slab_destroy(cachep, slabp); 2283 slab_destroy(cache, slabp);
2235 spin_lock_irq(&l3->list_lock); 2284 nr_freed++;
2236 } 2285 }
2237 ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); 2286out:
2238 return ret; 2287 return nr_freed;
2239} 2288}
2240 2289
2241static int __cache_shrink(struct kmem_cache *cachep) 2290static int __cache_shrink(struct kmem_cache *cachep)
@@ -2248,11 +2297,13 @@ static int __cache_shrink(struct kmem_cache *cachep)
2248 check_irq_on(); 2297 check_irq_on();
2249 for_each_online_node(i) { 2298 for_each_online_node(i) {
2250 l3 = cachep->nodelists[i]; 2299 l3 = cachep->nodelists[i];
2251 if (l3) { 2300 if (!l3)
2252 spin_lock_irq(&l3->list_lock); 2301 continue;
2253 ret += __node_shrink(cachep, i); 2302
2254 spin_unlock_irq(&l3->list_lock); 2303 drain_freelist(cachep, l3, l3->free_objects);
2255 } 2304
2305 ret += !list_empty(&l3->slabs_full) ||
2306 !list_empty(&l3->slabs_partial);
2256 } 2307 }
2257 return (ret ? 1 : 0); 2308 return (ret ? 1 : 0);
2258} 2309}
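
Editor's note: drain_freelist() above folds __node_shrink() and the open-coded reaping loop in cache_reap() into one helper: pop at most tofree completely free slabs, destroy each one with the list lock dropped, and report how many were actually released so the caller (and the REAPED statistics) can use the count. The control flow reduces to the following single-threaded sketch, where a plain linked list stands in for l3->slabs_free and the lock handling is left out.

    /* Control-flow sketch of drain_freelist(): free at most 'tofree' slabs. */
    #include <stdio.h>
    #include <stdlib.h>

    struct slab {
        struct slab *next;
    };

    static struct slab *slabs_free;     /* stand-in for l3->slabs_free */
    static int free_objects;            /* stand-in for l3->free_objects */
    #define OBJS_PER_SLAB 8             /* assumed cache->num */

    static int drain_freelist(int tofree)
    {
        int nr_freed = 0;

        while (nr_freed < tofree && slabs_free) {
            struct slab *slabp = slabs_free;

            slabs_free = slabp->next;        /* unlink from the free list */
            free_objects -= OBJS_PER_SLAB;   /* accounting, as in the hunk */
            free(slabp);                     /* slab_destroy() stand-in */
            nr_freed++;
        }
        return nr_freed;
    }

    int main(void)
    {
        for (int i = 0; i < 5; i++) {        /* build five free slabs */
            struct slab *s = malloc(sizeof(*s));
            s->next = slabs_free;
            slabs_free = s;
            free_objects += OBJS_PER_SLAB;
        }

        int n = drain_freelist(3);
        printf("released %d slab(s), %d free objects left\n", n, free_objects);
        n = drain_freelist(100);             /* drains whatever remains */
        printf("released %d slab(s), %d free objects left\n", n, free_objects);
        return 0;
    }
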
@@ -2460,23 +2511,28 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2460 slabp->inuse--; 2511 slabp->inuse--;
2461} 2512}
2462 2513
2463static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, 2514/*
2464 void *objp) 2515 * Map pages beginning at addr to the given cache and slab. This is required
2516 * for the slab allocator to be able to lookup the cache and slab of a
2517 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2518 */
2519static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2520 void *addr)
2465{ 2521{
2466 int i; 2522 int nr_pages;
2467 struct page *page; 2523 struct page *page;
2468 2524
2469 /* Nasty!!!!!! I hope this is OK. */ 2525 page = virt_to_page(addr);
2470 page = virt_to_page(objp);
2471 2526
2472 i = 1; 2527 nr_pages = 1;
2473 if (likely(!PageCompound(page))) 2528 if (likely(!PageCompound(page)))
2474 i <<= cachep->gfporder; 2529 nr_pages <<= cache->gfporder;
2530
2475 do { 2531 do {
2476 page_set_cache(page, cachep); 2532 page_set_cache(page, cache);
2477 page_set_slab(page, slabp); 2533 page_set_slab(page, slab);
2478 page++; 2534 page++;
2479 } while (--i); 2535 } while (--nr_pages);
2480} 2536}
2481 2537
2482/* 2538/*
@@ -2548,7 +2604,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2548 goto opps1; 2604 goto opps1;
2549 2605
2550 slabp->nodeid = nodeid; 2606 slabp->nodeid = nodeid;
2551 set_slab_attr(cachep, slabp, objp); 2607 slab_map_pages(cachep, slabp, objp);
2552 2608
2553 cache_init_objs(cachep, slabp, ctor_flags); 2609 cache_init_objs(cachep, slabp, ctor_flags);
2554 2610
@@ -2596,6 +2652,28 @@ static void kfree_debugcheck(const void *objp)
2596 } 2652 }
2597} 2653}
2598 2654
2655static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2656{
2657 unsigned long redzone1, redzone2;
2658
2659 redzone1 = *dbg_redzone1(cache, obj);
2660 redzone2 = *dbg_redzone2(cache, obj);
2661
2662 /*
2663 * Redzone is ok.
2664 */
2665 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2666 return;
2667
2668 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2669 slab_error(cache, "double free detected");
2670 else
2671 slab_error(cache, "memory outside object was overwritten");
2672
2673 printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
2674 obj, redzone1, redzone2);
2675}
2676
2599static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2677static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2600 void *caller) 2678 void *caller)
2601{ 2679{
@@ -2607,27 +2685,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2607 kfree_debugcheck(objp); 2685 kfree_debugcheck(objp);
2608 page = virt_to_page(objp); 2686 page = virt_to_page(objp);
2609 2687
2610 if (page_get_cache(page) != cachep) {
2611 printk(KERN_ERR "mismatch in kmem_cache_free: expected "
2612 "cache %p, got %p\n",
2613 page_get_cache(page), cachep);
2614 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2615 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2616 page_get_cache(page)->name);
2617 WARN_ON(1);
2618 }
2619 slabp = page_get_slab(page); 2688 slabp = page_get_slab(page);
2620 2689
2621 if (cachep->flags & SLAB_RED_ZONE) { 2690 if (cachep->flags & SLAB_RED_ZONE) {
2622 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || 2691 verify_redzone_free(cachep, objp);
2623 *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2624 slab_error(cachep, "double free, or memory outside"
2625 " object was overwritten");
2626 printk(KERN_ERR "%p: redzone 1:0x%lx, "
2627 "redzone 2:0x%lx.\n",
2628 objp, *dbg_redzone1(cachep, objp),
2629 *dbg_redzone2(cachep, objp));
2630 }
2631 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2692 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2632 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2693 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2633 } 2694 }
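
Editor's note: verify_redzone_free() also splits the old combined message into two distinct diagnoses: both guard words already RED_INACTIVE means the object was freed twice, any other mismatch means a write ran past the object's bounds. The userspace sketch below applies the same red-zoning idea to malloc'd buffers, reusing the RED_ACTIVE/RED_INACTIVE values that this patch removes from slab.c in favour of <linux/poison.h>; the allocator wrappers and the aligned-size assumption are invented for the example.

    /* Toy red-zoning: guard words before and after an object, checked on free.
     * 'size' is assumed to be a multiple of sizeof(unsigned long). */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define RED_ACTIVE   0x170FC2A5UL
    #define RED_INACTIVE 0x5A2CF071UL

    static void *rz_alloc(size_t size)
    {
        unsigned long *p = malloc(size + 2 * sizeof(unsigned long));
        p[0] = RED_ACTIVE;                                        /* front guard */
        *(unsigned long *)((char *)(p + 1) + size) = RED_ACTIVE;  /* back guard */
        return p + 1;
    }

    static void rz_free(void *obj, size_t size)
    {
        unsigned long *p = (unsigned long *)obj - 1;
        unsigned long rz1 = p[0];
        unsigned long rz2 = *(unsigned long *)((char *)obj + size);

        if (rz1 == RED_ACTIVE && rz2 == RED_ACTIVE) {
            p[0] = RED_INACTIVE;                              /* mark as freed */
            *(unsigned long *)((char *)obj + size) = RED_INACTIVE;
            free(p);
            return;
        }
        if (rz1 == RED_INACTIVE && rz2 == RED_INACTIVE)
            fprintf(stderr, "double free detected\n");
        else
            fprintf(stderr, "memory outside object was overwritten\n");
        /* Corrupted object is leaked on purpose in this sketch. */
    }

    int main(void)
    {
        char *buf = rz_alloc(16);
        memset(buf, 0, 16);
        buf[16] = 'X';          /* overrun clobbers the back guard word */
        rz_free(buf, 16);       /* reports the overwrite */
        return 0;
    }

Unlike this sketch, the slab allocator keeps freed objects inside the slab page, so both guard words remain readable and the double-free case can actually be observed later.
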
@@ -3087,41 +3148,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3087 check_irq_off(); 3148 check_irq_off();
3088 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3149 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3089 3150
3090 /* Make sure we are not freeing a object from another 3151 if (cache_free_alien(cachep, objp))
3091 * node to the array cache on this cpu. 3152 return;
3092 */ 3153
3093#ifdef CONFIG_NUMA
3094 {
3095 struct slab *slabp;
3096 slabp = virt_to_slab(objp);
3097 if (unlikely(slabp->nodeid != numa_node_id())) {
3098 struct array_cache *alien = NULL;
3099 int nodeid = slabp->nodeid;
3100 struct kmem_list3 *l3;
3101
3102 l3 = cachep->nodelists[numa_node_id()];
3103 STATS_INC_NODEFREES(cachep);
3104 if (l3->alien && l3->alien[nodeid]) {
3105 alien = l3->alien[nodeid];
3106 spin_lock(&alien->lock);
3107 if (unlikely(alien->avail == alien->limit)) {
3108 STATS_INC_ACOVERFLOW(cachep);
3109 __drain_alien_cache(cachep,
3110 alien, nodeid);
3111 }
3112 alien->entry[alien->avail++] = objp;
3113 spin_unlock(&alien->lock);
3114 } else {
3115 spin_lock(&(cachep->nodelists[nodeid])->
3116 list_lock);
3117 free_block(cachep, &objp, 1, nodeid);
3118 spin_unlock(&(cachep->nodelists[nodeid])->
3119 list_lock);
3120 }
3121 return;
3122 }
3123 }
3124#endif
3125 if (likely(ac->avail < ac->limit)) { 3154 if (likely(ac->avail < ac->limit)) {
3126 STATS_INC_FREEHIT(cachep); 3155 STATS_INC_FREEHIT(cachep);
3127 ac->entry[ac->avail++] = objp; 3156 ac->entry[ac->avail++] = objp;
@@ -3254,26 +3283,10 @@ EXPORT_SYMBOL(kmalloc_node);
3254#endif 3283#endif
3255 3284
3256/** 3285/**
3257 * kmalloc - allocate memory 3286 * __do_kmalloc - allocate memory
3258 * @size: how many bytes of memory are required. 3287 * @size: how many bytes of memory are required.
3259 * @flags: the type of memory to allocate. 3288 * @flags: the type of memory to allocate (see kmalloc).
3260 * @caller: function caller for debug tracking of the caller 3289 * @caller: function caller for debug tracking of the caller
3261 *
3262 * kmalloc is the normal method of allocating memory
3263 * in the kernel.
3264 *
3265 * The @flags argument may be one of:
3266 *
3267 * %GFP_USER - Allocate memory on behalf of user. May sleep.
3268 *
3269 * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
3270 *
3271 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers.
3272 *
3273 * Additionally, the %GFP_DMA flag may be set to indicate the memory
3274 * must be suitable for DMA. This can mean different things on different
3275 * platforms. For example, on i386, it means that the memory must come
3276 * from the first 16MB.
3277 */ 3290 */
3278static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3291static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3279 void *caller) 3292 void *caller)
@@ -3371,6 +3384,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3371{ 3384{
3372 unsigned long flags; 3385 unsigned long flags;
3373 3386
3387 BUG_ON(virt_to_cache(objp) != cachep);
3388
3374 local_irq_save(flags); 3389 local_irq_save(flags);
3375 __cache_free(cachep, objp); 3390 __cache_free(cachep, objp);
3376 local_irq_restore(flags); 3391 local_irq_restore(flags);
@@ -3396,7 +3411,7 @@ void kfree(const void *objp)
3396 local_irq_save(flags); 3411 local_irq_save(flags);
3397 kfree_debugcheck(objp); 3412 kfree_debugcheck(objp);
3398 c = virt_to_cache(objp); 3413 c = virt_to_cache(objp);
3399 mutex_debug_check_no_locks_freed(objp, obj_size(c)); 3414 debug_check_no_locks_freed(objp, obj_size(c));
3400 __cache_free(c, (void *)objp); 3415 __cache_free(c, (void *)objp);
3401 local_irq_restore(flags); 3416 local_irq_restore(flags);
3402} 3417}
@@ -3680,7 +3695,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3680 */ 3695 */
3681static void cache_reap(void *unused) 3696static void cache_reap(void *unused)
3682{ 3697{
3683 struct list_head *walk; 3698 struct kmem_cache *searchp;
3684 struct kmem_list3 *l3; 3699 struct kmem_list3 *l3;
3685 int node = numa_node_id(); 3700 int node = numa_node_id();
3686 3701
@@ -3691,13 +3706,7 @@ static void cache_reap(void *unused)
3691 return; 3706 return;
3692 } 3707 }
3693 3708
3694 list_for_each(walk, &cache_chain) { 3709 list_for_each_entry(searchp, &cache_chain, next) {
3695 struct kmem_cache *searchp;
3696 struct list_head *p;
3697 int tofree;
3698 struct slab *slabp;
3699
3700 searchp = list_entry(walk, struct kmem_cache, next);
3701 check_irq_on(); 3710 check_irq_on();
3702 3711
3703 /* 3712 /*
@@ -3722,47 +3731,22 @@ static void cache_reap(void *unused)
3722 3731
3723 drain_array(searchp, l3, l3->shared, 0, node); 3732 drain_array(searchp, l3, l3->shared, 0, node);
3724 3733
3725 if (l3->free_touched) { 3734 if (l3->free_touched)
3726 l3->free_touched = 0; 3735 l3->free_touched = 0;
3727 goto next; 3736 else {
3728 } 3737 int freed;
3729 3738
3730 tofree = (l3->free_limit + 5 * searchp->num - 1) / 3739 freed = drain_freelist(searchp, l3, (l3->free_limit +
3731 (5 * searchp->num); 3740 5 * searchp->num - 1) / (5 * searchp->num));
3732 do { 3741 STATS_ADD_REAPED(searchp, freed);
3733 /* 3742 }
3734 * Do not lock if there are no free blocks.
3735 */
3736 if (list_empty(&l3->slabs_free))
3737 break;
3738
3739 spin_lock_irq(&l3->list_lock);
3740 p = l3->slabs_free.next;
3741 if (p == &(l3->slabs_free)) {
3742 spin_unlock_irq(&l3->list_lock);
3743 break;
3744 }
3745
3746 slabp = list_entry(p, struct slab, list);
3747 BUG_ON(slabp->inuse);
3748 list_del(&slabp->list);
3749 STATS_INC_REAPED(searchp);
3750
3751 /*
3752 * Safe to drop the lock. The slab is no longer linked
3753 * to the cache. searchp cannot disappear, we hold
3754 * cache_chain_lock
3755 */
3756 l3->free_objects -= searchp->num;
3757 spin_unlock_irq(&l3->list_lock);
3758 slab_destroy(searchp, slabp);
3759 } while (--tofree > 0);
3760next: 3743next:
3761 cond_resched(); 3744 cond_resched();
3762 } 3745 }
3763 check_irq_on(); 3746 check_irq_on();
3764 mutex_unlock(&cache_chain_mutex); 3747 mutex_unlock(&cache_chain_mutex);
3765 next_reap_node(); 3748 next_reap_node();
3749 refresh_cpu_vm_stats(smp_processor_id());
3766 /* Set up the next iteration */ 3750 /* Set up the next iteration */
3767 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3751 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3768} 3752}
@@ -3825,7 +3809,6 @@ static void s_stop(struct seq_file *m, void *p)
3825static int s_show(struct seq_file *m, void *p) 3809static int s_show(struct seq_file *m, void *p)
3826{ 3810{
3827 struct kmem_cache *cachep = p; 3811 struct kmem_cache *cachep = p;
3828 struct list_head *q;
3829 struct slab *slabp; 3812 struct slab *slabp;
3830 unsigned long active_objs; 3813 unsigned long active_objs;
3831 unsigned long num_objs; 3814 unsigned long num_objs;
@@ -3846,15 +3829,13 @@ static int s_show(struct seq_file *m, void *p)
3846 check_irq_on(); 3829 check_irq_on();
3847 spin_lock_irq(&l3->list_lock); 3830 spin_lock_irq(&l3->list_lock);
3848 3831
3849 list_for_each(q, &l3->slabs_full) { 3832 list_for_each_entry(slabp, &l3->slabs_full, list) {
3850 slabp = list_entry(q, struct slab, list);
3851 if (slabp->inuse != cachep->num && !error) 3833 if (slabp->inuse != cachep->num && !error)
3852 error = "slabs_full accounting error"; 3834 error = "slabs_full accounting error";
3853 active_objs += cachep->num; 3835 active_objs += cachep->num;
3854 active_slabs++; 3836 active_slabs++;
3855 } 3837 }
3856 list_for_each(q, &l3->slabs_partial) { 3838 list_for_each_entry(slabp, &l3->slabs_partial, list) {
3857 slabp = list_entry(q, struct slab, list);
3858 if (slabp->inuse == cachep->num && !error) 3839 if (slabp->inuse == cachep->num && !error)
3859 error = "slabs_partial inuse accounting error"; 3840 error = "slabs_partial inuse accounting error";
3860 if (!slabp->inuse && !error) 3841 if (!slabp->inuse && !error)
@@ -3862,8 +3843,7 @@ static int s_show(struct seq_file *m, void *p)
3862 active_objs += slabp->inuse; 3843 active_objs += slabp->inuse;
3863 active_slabs++; 3844 active_slabs++;
3864 } 3845 }
3865 list_for_each(q, &l3->slabs_free) { 3846 list_for_each_entry(slabp, &l3->slabs_free, list) {
3866 slabp = list_entry(q, struct slab, list);
3867 if (slabp->inuse && !error) 3847 if (slabp->inuse && !error)
3868 error = "slabs_free/inuse accounting error"; 3848 error = "slabs_free/inuse accounting error";
3869 num_slabs++; 3849 num_slabs++;
@@ -3956,7 +3936,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3956{ 3936{
3957 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 3937 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3958 int limit, batchcount, shared, res; 3938 int limit, batchcount, shared, res;
3959 struct list_head *p; 3939 struct kmem_cache *cachep;
3960 3940
3961 if (count > MAX_SLABINFO_WRITE) 3941 if (count > MAX_SLABINFO_WRITE)
3962 return -EINVAL; 3942 return -EINVAL;
@@ -3975,10 +3955,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3975 /* Find the cache in the chain of caches. */ 3955 /* Find the cache in the chain of caches. */
3976 mutex_lock(&cache_chain_mutex); 3956 mutex_lock(&cache_chain_mutex);
3977 res = -EINVAL; 3957 res = -EINVAL;
3978 list_for_each(p, &cache_chain) { 3958 list_for_each_entry(cachep, &cache_chain, next) {
3979 struct kmem_cache *cachep;
3980
3981 cachep = list_entry(p, struct kmem_cache, next);
3982 if (!strcmp(cachep->name, kbuf)) { 3959 if (!strcmp(cachep->name, kbuf)) {
3983 if (limit < 1 || batchcount < 1 || 3960 if (limit < 1 || batchcount < 1 ||
3984 batchcount > limit || shared < 0) { 3961 batchcount > limit || shared < 0) {
@@ -4080,7 +4057,6 @@ static void show_symbol(struct seq_file *m, unsigned long address)
4080static int leaks_show(struct seq_file *m, void *p) 4057static int leaks_show(struct seq_file *m, void *p)
4081{ 4058{
4082 struct kmem_cache *cachep = p; 4059 struct kmem_cache *cachep = p;
4083 struct list_head *q;
4084 struct slab *slabp; 4060 struct slab *slabp;
4085 struct kmem_list3 *l3; 4061 struct kmem_list3 *l3;
4086 const char *name; 4062 const char *name;
@@ -4105,14 +4081,10 @@ static int leaks_show(struct seq_file *m, void *p)
4105 check_irq_on(); 4081 check_irq_on();
4106 spin_lock_irq(&l3->list_lock); 4082 spin_lock_irq(&l3->list_lock);
4107 4083
4108 list_for_each(q, &l3->slabs_full) { 4084 list_for_each_entry(slabp, &l3->slabs_full, list)
4109 slabp = list_entry(q, struct slab, list);
4110 handle_slab(n, cachep, slabp); 4085 handle_slab(n, cachep, slabp);
4111 } 4086 list_for_each_entry(slabp, &l3->slabs_partial, list)
4112 list_for_each(q, &l3->slabs_partial) {
4113 slabp = list_entry(q, struct slab, list);
4114 handle_slab(n, cachep, slabp); 4087 handle_slab(n, cachep, slabp);
4115 }
4116 spin_unlock_irq(&l3->list_lock); 4088 spin_unlock_irq(&l3->list_lock);
4117 } 4089 }
4118 name = cachep->name; 4090 name = cachep->name;
diff --git a/mm/slob.c b/mm/slob.c
index a68255ba4553..7b52b20b9607 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -29,7 +29,6 @@
29 * essentially no allocation space overhead. 29 * essentially no allocation space overhead.
30 */ 30 */
31 31
32#include <linux/config.h>
33#include <linux/slab.h> 32#include <linux/slab.h>
34#include <linux/mm.h> 33#include <linux/mm.h>
35#include <linux/cache.h> 34#include <linux/cache.h>
diff --git a/mm/sparse.c b/mm/sparse.c
index 100040c0dfb6..86c52ab80878 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -1,7 +1,6 @@
1/* 1/*
2 * sparse memory mappings. 2 * sparse memory mappings.
3 */ 3 */
4#include <linux/config.h>
5#include <linux/mm.h> 4#include <linux/mm.h>
6#include <linux/mmzone.h> 5#include <linux/mmzone.h>
7#include <linux/bootmem.h> 6#include <linux/bootmem.h>
@@ -45,7 +44,7 @@ static struct mem_section *sparse_index_alloc(int nid)
45 44
46static int sparse_index_init(unsigned long section_nr, int nid) 45static int sparse_index_init(unsigned long section_nr, int nid)
47{ 46{
48 static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED; 47 static DEFINE_SPINLOCK(index_init_lock);
49 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 48 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
50 struct mem_section *section; 49 struct mem_section *section;
51 int ret = 0; 50 int ret = 0;
@@ -99,6 +98,22 @@ int __section_nr(struct mem_section* ms)
99 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 98 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
100} 99}
101 100
101/*
102 * During early boot, before section_mem_map is used for an actual
103 * mem_map, we use section_mem_map to store the section's NUMA
104 * node. This keeps us from having to use another data structure. The
105 * node information is cleared just before we store the real mem_map.
106 */
107static inline unsigned long sparse_encode_early_nid(int nid)
108{
109 return (nid << SECTION_NID_SHIFT);
110}
111
112static inline int sparse_early_nid(struct mem_section *section)
113{
114 return (section->section_mem_map >> SECTION_NID_SHIFT);
115}
116
102/* Record a memory area against a node. */ 117/* Record a memory area against a node. */
103void memory_present(int nid, unsigned long start, unsigned long end) 118void memory_present(int nid, unsigned long start, unsigned long end)
104{ 119{
@@ -113,7 +128,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
113 128
114 ms = __nr_to_section(section); 129 ms = __nr_to_section(section);
115 if (!ms->section_mem_map) 130 if (!ms->section_mem_map)
116 ms->section_mem_map = SECTION_MARKED_PRESENT; 131 ms->section_mem_map = sparse_encode_early_nid(nid) |
132 SECTION_MARKED_PRESENT;
117 } 133 }
118} 134}
119 135
@@ -164,6 +180,7 @@ static int sparse_init_one_section(struct mem_section *ms,
164 if (!valid_section(ms)) 180 if (!valid_section(ms))
165 return -EINVAL; 181 return -EINVAL;
166 182
183 ms->section_mem_map &= ~SECTION_MAP_MASK;
167 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); 184 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
168 185
169 return 1; 186 return 1;
@@ -172,8 +189,8 @@ static int sparse_init_one_section(struct mem_section *ms,
172static struct page *sparse_early_mem_map_alloc(unsigned long pnum) 189static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
173{ 190{
174 struct page *map; 191 struct page *map;
175 int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
176 struct mem_section *ms = __nr_to_section(pnum); 192 struct mem_section *ms = __nr_to_section(pnum);
193 int nid = sparse_early_nid(ms);
177 194
178 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); 195 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
179 if (map) 196 if (map)
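
Editor's note: the sparse_encode_early_nid()/sparse_early_nid() pair added above lets section_mem_map do double duty. Before a real mem_map is attached, the section's NUMA node id lives above the flag bits next to SECTION_MARKED_PRESENT, and sparse_init_one_section() now clears everything above those flag bits before or-ing in the encoded mem_map. A compact model of that two-phase reuse of one word follows; the shift, mask and flag values are made up for the sketch.

    /* One word, two phases: low bits hold flags, the rest holds either an
     * early-boot node id or (later) an encoded mem_map. */
    #include <assert.h>
    #include <stdio.h>

    #define SECTION_MARKED_PRESENT  (1UL << 0)
    #define SECTION_HAS_MEM_MAP     (1UL << 1)
    #define SECTION_MAP_LAST_BIT    (1UL << 2)
    #define SECTION_MAP_MASK        (~(SECTION_MAP_LAST_BIT - 1))
    #define SECTION_NID_SHIFT       2

    struct mem_section { unsigned long section_mem_map; };

    static void memory_present(struct mem_section *ms, int nid)
    {
        /* Early boot: remember only the node id and the "present" flag. */
        ms->section_mem_map = ((unsigned long)nid << SECTION_NID_SHIFT) |
                              SECTION_MARKED_PRESENT;
    }

    static int sparse_early_nid(struct mem_section *ms)
    {
        return (int)(ms->section_mem_map >> SECTION_NID_SHIFT);
    }

    static void sparse_init_one_section(struct mem_section *ms,
                                        unsigned long encoded_mem_map)
    {
        /* Drop the early node id, keep the low flag bits, attach the map. */
        ms->section_mem_map &= ~SECTION_MAP_MASK;
        ms->section_mem_map |= encoded_mem_map | SECTION_HAS_MEM_MAP;
    }

    int main(void)
    {
        struct mem_section ms;

        memory_present(&ms, 3);
        assert(sparse_early_nid(&ms) == 3);       /* nid valid only until... */

        sparse_init_one_section(&ms, 0x1000UL);   /* ...the map is attached */
        assert(ms.section_mem_map & SECTION_MARKED_PRESENT);
        printf("section_mem_map = 0x%lx\n", ms.section_mem_map);
        return 0;
    }
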
diff --git a/mm/swap.c b/mm/swap.c
index 88895c249bc9..8fd095c4ae51 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -86,9 +86,8 @@ int rotate_reclaimable_page(struct page *page)
86 zone = page_zone(page); 86 zone = page_zone(page);
87 spin_lock_irqsave(&zone->lru_lock, flags); 87 spin_lock_irqsave(&zone->lru_lock, flags);
88 if (PageLRU(page) && !PageActive(page)) { 88 if (PageLRU(page) && !PageActive(page)) {
89 list_del(&page->lru); 89 list_move_tail(&page->lru, &zone->inactive_list);
90 list_add_tail(&page->lru, &zone->inactive_list); 90 __count_vm_event(PGROTATED);
91 inc_page_state(pgrotated);
92 } 91 }
93 if (!test_clear_page_writeback(page)) 92 if (!test_clear_page_writeback(page))
94 BUG(); 93 BUG();
@@ -108,7 +107,7 @@ void fastcall activate_page(struct page *page)
108 del_page_from_inactive_list(zone, page); 107 del_page_from_inactive_list(zone, page);
109 SetPageActive(page); 108 SetPageActive(page);
110 add_page_to_active_list(zone, page); 109 add_page_to_active_list(zone, page);
111 inc_page_state(pgactivate); 110 __count_vm_event(PGACTIVATE);
112 } 111 }
113 spin_unlock_irq(&zone->lru_lock); 112 spin_unlock_irq(&zone->lru_lock);
114} 113}
@@ -480,48 +479,6 @@ static int cpu_swap_callback(struct notifier_block *nfb,
480#endif /* CONFIG_HOTPLUG_CPU */ 479#endif /* CONFIG_HOTPLUG_CPU */
481#endif /* CONFIG_SMP */ 480#endif /* CONFIG_SMP */
482 481
483#ifdef CONFIG_SMP
484void percpu_counter_mod(struct percpu_counter *fbc, long amount)
485{
486 long count;
487 long *pcount;
488 int cpu = get_cpu();
489
490 pcount = per_cpu_ptr(fbc->counters, cpu);
491 count = *pcount + amount;
492 if (count >= FBC_BATCH || count <= -FBC_BATCH) {
493 spin_lock(&fbc->lock);
494 fbc->count += count;
495 *pcount = 0;
496 spin_unlock(&fbc->lock);
497 } else {
498 *pcount = count;
499 }
500 put_cpu();
501}
502EXPORT_SYMBOL(percpu_counter_mod);
503
504/*
505 * Add up all the per-cpu counts, return the result. This is a more accurate
506 * but much slower version of percpu_counter_read_positive()
507 */
508long percpu_counter_sum(struct percpu_counter *fbc)
509{
510 long ret;
511 int cpu;
512
513 spin_lock(&fbc->lock);
514 ret = fbc->count;
515 for_each_possible_cpu(cpu) {
516 long *pcount = per_cpu_ptr(fbc->counters, cpu);
517 ret += *pcount;
518 }
519 spin_unlock(&fbc->lock);
520 return ret < 0 ? 0 : ret;
521}
522EXPORT_SYMBOL(percpu_counter_sum);
523#endif
524
525/* 482/*
526 * Perform any setup for the swap system 483 * Perform any setup for the swap system
527 */ 484 */
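
Editor's note: percpu_counter_mod() and percpu_counter_sum() leave mm/swap.c in this patch, but the batching scheme they implement is worth spelling out: each CPU accumulates a private delta and folds it into the shared count only once it crosses +/-FBC_BATCH, so the approximate read stays cheap while the exact sum walks all CPUs. The sketch below keeps only the arithmetic; the per-CPU placement and the spinlock that the removed code uses are left out, so it is a single-threaded model.

    /* Model of a batched per-CPU counter: fold into the shared total only
     * when a CPU's local delta exceeds the batch size. Locking omitted. */
    #include <stdio.h>

    #define NR_CPUS   4
    #define FBC_BATCH 32     /* assumed batch threshold */

    struct percpu_counter {
        long count;               /* shared, approximately up to date */
        long counters[NR_CPUS];   /* per-CPU deltas not yet folded in */
    };

    static void percpu_counter_mod(struct percpu_counter *fbc, int cpu, long amount)
    {
        long count = fbc->counters[cpu] + amount;

        if (count >= FBC_BATCH || count <= -FBC_BATCH) {
            fbc->count += count;       /* the real code takes fbc->lock here */
            fbc->counters[cpu] = 0;
        } else {
            fbc->counters[cpu] = count;
        }
    }

    /* Slower but exact: add every CPU's pending delta to the shared count. */
    static long percpu_counter_sum(struct percpu_counter *fbc)
    {
        long ret = fbc->count;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            ret += fbc->counters[cpu];
        return ret < 0 ? 0 : ret;
    }

    int main(void)
    {
        struct percpu_counter fbc = { 0 };

        for (int i = 0; i < 200; i++)
            percpu_counter_mod(&fbc, i % NR_CPUS, 1);

        printf("approximate: %ld, exact: %ld\n",
               fbc.count, percpu_counter_sum(&fbc));
        return 0;
    }
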
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e0e1583f32c2..fccbd9bba77b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,7 +24,7 @@
24 * vmscan's shrink_list, to make sync_page look nicer, and to allow 24 * vmscan's shrink_list, to make sync_page look nicer, and to allow
25 * future use of radix_tree tags in the swap cache. 25 * future use of radix_tree tags in the swap cache.
26 */ 26 */
27static struct address_space_operations swap_aops = { 27static const struct address_space_operations swap_aops = {
28 .writepage = swap_writepage, 28 .writepage = swap_writepage,
29 .sync_page = block_sync_page, 29 .sync_page = block_sync_page,
30 .set_page_dirty = __set_page_dirty_nobuffers, 30 .set_page_dirty = __set_page_dirty_nobuffers,
@@ -87,7 +87,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
87 SetPageSwapCache(page); 87 SetPageSwapCache(page);
88 set_page_private(page, entry.val); 88 set_page_private(page, entry.val);
89 total_swapcache_pages++; 89 total_swapcache_pages++;
90 pagecache_acct(1); 90 __inc_zone_page_state(page, NR_FILE_PAGES);
91 } 91 }
92 write_unlock_irq(&swapper_space.tree_lock); 92 write_unlock_irq(&swapper_space.tree_lock);
93 radix_tree_preload_end(); 93 radix_tree_preload_end();
@@ -132,7 +132,7 @@ void __delete_from_swap_cache(struct page *page)
132 set_page_private(page, 0); 132 set_page_private(page, 0);
133 ClearPageSwapCache(page); 133 ClearPageSwapCache(page);
134 total_swapcache_pages--; 134 total_swapcache_pages--;
135 pagecache_acct(-1); 135 __dec_zone_page_state(page, NR_FILE_PAGES);
136 INC_CACHE_INFO(del_total); 136 INC_CACHE_INFO(del_total);
137} 137}
138 138
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e5fd5385f0cc..e70d6c6d6fee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -5,7 +5,6 @@
5 * Swap reorganised 29.12.95, Stephen Tweedie 5 * Swap reorganised 29.12.95, Stephen Tweedie
6 */ 6 */
7 7
8#include <linux/config.h>
9#include <linux/mm.h> 8#include <linux/mm.h>
10#include <linux/hugetlb.h> 9#include <linux/hugetlb.h>
11#include <linux/mman.h> 10#include <linux/mman.h>
@@ -395,6 +394,9 @@ void free_swap_and_cache(swp_entry_t entry)
395 struct swap_info_struct * p; 394 struct swap_info_struct * p;
396 struct page *page = NULL; 395 struct page *page = NULL;
397 396
397 if (is_migration_entry(entry))
398 return;
399
398 p = swap_info_get(entry); 400 p = swap_info_get(entry);
399 if (p) { 401 if (p) {
400 if (swap_entry_free(p, swp_offset(entry)) == 1) { 402 if (swap_entry_free(p, swp_offset(entry)) == 1) {
@@ -615,15 +617,6 @@ static int unuse_mm(struct mm_struct *mm,
615 return 0; 617 return 0;
616} 618}
617 619
618#ifdef CONFIG_MIGRATION
619int remove_vma_swap(struct vm_area_struct *vma, struct page *page)
620{
621 swp_entry_t entry = { .val = page_private(page) };
622
623 return unuse_vma(vma, entry, page);
624}
625#endif
626
627/* 620/*
628 * Scan swap_map from current position to next entry still in use. 621 * Scan swap_map from current position to next entry still in use.
629 * Recycle to start on reaching the end, returning 0 when empty. 622 * Recycle to start on reaching the end, returning 0 when empty.
@@ -716,7 +709,6 @@ static int try_to_unuse(unsigned int type)
716 */ 709 */
717 swap_map = &si->swap_map[i]; 710 swap_map = &si->swap_map[i];
718 entry = swp_entry(type, i); 711 entry = swp_entry(type, i);
719again:
720 page = read_swap_cache_async(entry, NULL, 0); 712 page = read_swap_cache_async(entry, NULL, 0);
721 if (!page) { 713 if (!page) {
722 /* 714 /*
@@ -751,12 +743,6 @@ again:
751 wait_on_page_locked(page); 743 wait_on_page_locked(page);
752 wait_on_page_writeback(page); 744 wait_on_page_writeback(page);
753 lock_page(page); 745 lock_page(page);
754 if (!PageSwapCache(page)) {
755 /* Page migration has occured */
756 unlock_page(page);
757 page_cache_release(page);
758 goto again;
759 }
760 wait_on_page_writeback(page); 746 wait_on_page_writeback(page);
761 747
762 /* 748 /*
@@ -785,10 +771,8 @@ again:
785 while (*swap_map > 1 && !retval && 771 while (*swap_map > 1 && !retval &&
786 (p = p->next) != &start_mm->mmlist) { 772 (p = p->next) != &start_mm->mmlist) {
787 mm = list_entry(p, struct mm_struct, mmlist); 773 mm = list_entry(p, struct mm_struct, mmlist);
788 if (atomic_inc_return(&mm->mm_users) == 1) { 774 if (!atomic_inc_not_zero(&mm->mm_users))
789 atomic_dec(&mm->mm_users);
790 continue; 775 continue;
791 }
792 spin_unlock(&mmlist_lock); 776 spin_unlock(&mmlist_lock);
793 mmput(prev_mm); 777 mmput(prev_mm);
794 prev_mm = mm; 778 prev_mm = mm;
@@ -1407,19 +1391,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1407 if (!(p->flags & SWP_USED)) 1391 if (!(p->flags & SWP_USED))
1408 break; 1392 break;
1409 error = -EPERM; 1393 error = -EPERM;
1410 /* 1394 if (type >= MAX_SWAPFILES) {
1411 * Test if adding another swap device is possible. There are
1412 * two limiting factors: 1) the number of bits for the swap
1413 * type swp_entry_t definition and 2) the number of bits for
1414 * the swap type in the swap ptes as defined by the different
1415 * architectures. To honor both limitations a swap entry
1416 * with swap offset 0 and swap type ~0UL is created, encoded
1417 * to a swap pte, decoded to a swp_entry_t again and finally
1418 * the swap type part is extracted. This will mask all bits
1419 * from the initial ~0UL that can't be encoded in either the
1420 * swp_entry_t or the architecture definition of a swap pte.
1421 */
1422 if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
1423 spin_unlock(&swap_lock); 1395 spin_unlock(&swap_lock);
1424 goto out; 1396 goto out;
1425 } 1397 }
@@ -1504,8 +1476,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1504 error = -EINVAL; 1476 error = -EINVAL;
1505 goto bad_swap; 1477 goto bad_swap;
1506 } 1478 }
1507 page = read_cache_page(mapping, 0, 1479 page = read_mapping_page(mapping, 0, swap_file);
1508 (filler_t *)mapping->a_ops->readpage, swap_file);
1509 if (IS_ERR(page)) { 1480 if (IS_ERR(page)) {
1510 error = PTR_ERR(page); 1481 error = PTR_ERR(page);
1511 goto bad_swap; 1482 goto bad_swap;
@@ -1709,6 +1680,9 @@ int swap_duplicate(swp_entry_t entry)
1709 unsigned long offset, type; 1680 unsigned long offset, type;
1710 int result = 0; 1681 int result = 0;
1711 1682
1683 if (is_migration_entry(entry))
1684 return 1;
1685
1712 type = swp_type(entry); 1686 type = swp_type(entry);
1713 if (type >= nr_swapfiles) 1687 if (type >= nr_swapfiles)
1714 goto bad_file; 1688 goto bad_file;
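
The try_to_unuse() hunk above replaces a racy "increment, then back out if the count had already hit zero" sequence with atomic_inc_not_zero(), which takes a reference only while the mm is still live. A rough userspace analogue of that primitive using C11 compare-and-exchange follows; inc_not_zero_demo is our own name, not a kernel symbol.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Take a reference only if the refcount has not already dropped to zero. */
    static bool inc_not_zero_demo(atomic_int *refcount)
    {
            int old = atomic_load(refcount);

            while (old != 0) {
                    /* Try old -> old + 1; on failure 'old' is reloaded and we retry. */
                    if (atomic_compare_exchange_weak(refcount, &old, old + 1))
                            return true;
            }
            return false;        /* object already dead, leave it alone */
    }

    int main(void)
    {
            atomic_int live = 2, dead = 0;

            printf("live: %d\n", inc_not_zero_demo(&live));  /* 1, count is now 3 */
            printf("dead: %d\n", inc_not_zero_demo(&dead));  /* 0, count stays 0  */
            return 0;
    }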
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index f9d6a9cc91c4..5f2cbf0f153c 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -12,7 +12,6 @@
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/devfs_fs_kernel.h>
16#include <linux/vfs.h> 15#include <linux/vfs.h>
17#include <linux/mount.h> 16#include <linux/mount.h>
18#include <linux/file.h> 17#include <linux/file.h>
@@ -33,9 +32,6 @@ static int __init init_tmpfs(void)
33{ 32{
34 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 33 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
35 34
36#ifdef CONFIG_TMPFS
37 devfs_mk_dir("shm");
38#endif
39 shm_mnt = kern_mount(&tmpfs_fs_type); 35 shm_mnt = kern_mount(&tmpfs_fs_type);
40 BUG_ON(IS_ERR(shm_mnt)); 36 BUG_ON(IS_ERR(shm_mnt));
41 37
diff --git a/mm/truncate.c b/mm/truncate.c
index 6cb3fff25f67..cf1b015df4a7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -230,14 +230,24 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
230 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 230 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
231 for (i = 0; i < pagevec_count(&pvec); i++) { 231 for (i = 0; i < pagevec_count(&pvec); i++) {
232 struct page *page = pvec.pages[i]; 232 struct page *page = pvec.pages[i];
233 pgoff_t index;
234 int lock_failed;
233 235
234 if (TestSetPageLocked(page)) { 236 lock_failed = TestSetPageLocked(page);
235 next++; 237
236 continue; 238 /*
237 } 239 * We really shouldn't be looking at the ->index of an
238 if (page->index > next) 240 * unlocked page. But we're not allowed to lock these
239 next = page->index; 241 * pages. So we rely upon nobody altering the ->index
242 * of this (pinned-by-us) page.
243 */
244 index = page->index;
245 if (index > next)
246 next = index;
240 next++; 247 next++;
248 if (lock_failed)
249 continue;
250
241 if (PageDirty(page) || PageWriteback(page)) 251 if (PageDirty(page) || PageWriteback(page))
242 goto unlock; 252 goto unlock;
243 if (page_mapped(page)) 253 if (page_mapped(page))
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c0504f1e34eb..35f8553f893a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -257,6 +257,19 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int
257} 257}
258 258
259/* Caller must hold vmlist_lock */ 259/* Caller must hold vmlist_lock */
260static struct vm_struct *__find_vm_area(void *addr)
261{
262 struct vm_struct *tmp;
263
264 for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
265 if (tmp->addr == addr)
266 break;
267 }
268
269 return tmp;
270}
271
272/* Caller must hold vmlist_lock */
260struct vm_struct *__remove_vm_area(void *addr) 273struct vm_struct *__remove_vm_area(void *addr)
261{ 274{
262 struct vm_struct **p, *tmp; 275 struct vm_struct **p, *tmp;
@@ -498,11 +511,33 @@ EXPORT_SYMBOL(__vmalloc);
498 */ 511 */
499void *vmalloc(unsigned long size) 512void *vmalloc(unsigned long size)
500{ 513{
501 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); 514 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
502} 515}
503EXPORT_SYMBOL(vmalloc); 516EXPORT_SYMBOL(vmalloc);
504 517
505/** 518/**
519 * vmalloc_user - allocate virtually contiguous memory which has
520 * been zeroed so it can be mapped to userspace without
521 * leaking data.
522 *
523 * @size: allocation size
524 */
525void *vmalloc_user(unsigned long size)
526{
527 struct vm_struct *area;
528 void *ret;
529
530 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
531 write_lock(&vmlist_lock);
532 area = __find_vm_area(ret);
533 area->flags |= VM_USERMAP;
534 write_unlock(&vmlist_lock);
535
536 return ret;
537}
538EXPORT_SYMBOL(vmalloc_user);
539
540/**
506 * vmalloc_node - allocate memory on a specific node 541 * vmalloc_node - allocate memory on a specific node
507 * 542 *
508 * @size: allocation size 543 * @size: allocation size
@@ -516,7 +551,7 @@ EXPORT_SYMBOL(vmalloc);
516 */ 551 */
517void *vmalloc_node(unsigned long size, int node) 552void *vmalloc_node(unsigned long size, int node)
518{ 553{
519 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); 554 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
520} 555}
521EXPORT_SYMBOL(vmalloc_node); 556EXPORT_SYMBOL(vmalloc_node);
522 557
@@ -556,6 +591,28 @@ void *vmalloc_32(unsigned long size)
556} 591}
557EXPORT_SYMBOL(vmalloc_32); 592EXPORT_SYMBOL(vmalloc_32);
558 593
594/**
595 * vmalloc_32_user - allocate virtually contiguous memory (32bit
596 * addressable) which is zeroed so it can be
597 * mapped to userspace without leaking data.
598 *
599 * @size: allocation size
600 */
601void *vmalloc_32_user(unsigned long size)
602{
603 struct vm_struct *area;
604 void *ret;
605
606 ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
607 write_lock(&vmlist_lock);
608 area = __find_vm_area(ret);
609 area->flags |= VM_USERMAP;
610 write_unlock(&vmlist_lock);
611
612 return ret;
613}
614EXPORT_SYMBOL(vmalloc_32_user);
615
559long vread(char *buf, char *addr, unsigned long count) 616long vread(char *buf, char *addr, unsigned long count)
560{ 617{
561 struct vm_struct *tmp; 618 struct vm_struct *tmp;
@@ -630,3 +687,64 @@ finished:
630 read_unlock(&vmlist_lock); 687 read_unlock(&vmlist_lock);
631 return buf - buf_start; 688 return buf - buf_start;
632} 689}
690
691/**
692 * remap_vmalloc_range - map vmalloc pages to userspace
693 *
694 * @vma: vma to cover (map full range of vma)
695 * @addr: vmalloc memory
696 * @pgoff: number of pages into addr before first page to map
697 * @returns: 0 for success, -Exxx on failure
698 *
699 * This function checks that addr is a valid vmalloc'ed area, and
700 * that it is big enough to cover the vma. Will return failure if
701 * that criteria isn't met.
702 *
703 * Similar to remap_pfn_range (see mm/memory.c)
704 */
705int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
706 unsigned long pgoff)
707{
708 struct vm_struct *area;
709 unsigned long uaddr = vma->vm_start;
710 unsigned long usize = vma->vm_end - vma->vm_start;
711 int ret;
712
713 if ((PAGE_SIZE-1) & (unsigned long)addr)
714 return -EINVAL;
715
716 read_lock(&vmlist_lock);
717 area = __find_vm_area(addr);
718 if (!area)
719 goto out_einval_locked;
720
721 if (!(area->flags & VM_USERMAP))
722 goto out_einval_locked;
723
724 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
725 goto out_einval_locked;
726 read_unlock(&vmlist_lock);
727
728 addr += pgoff << PAGE_SHIFT;
729 do {
730 struct page *page = vmalloc_to_page(addr);
731 ret = vm_insert_page(vma, uaddr, page);
732 if (ret)
733 return ret;
734
735 uaddr += PAGE_SIZE;
736 addr += PAGE_SIZE;
737 usize -= PAGE_SIZE;
738 } while (usize > 0);
739
740 /* Prevent "things" like memory migration? VM_flags need a cleanup... */
741 vma->vm_flags |= VM_RESERVED;
742
743 return ret;
744
745out_einval_locked:
746 read_unlock(&vmlist_lock);
747 return -EINVAL;
748}
749EXPORT_SYMBOL(remap_vmalloc_range);
750
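
vmalloc_user() and vmalloc_32_user() above return zeroed memory and tag the area with VM_USERMAP, and remap_vmalloc_range() refuses anything else, so a driver can safely hand such a buffer to userspace from its mmap() handler. The fragment below is only a sketch of that call pattern under stated assumptions: the mydev_* names and MYDEV_BUF_SIZE are hypothetical, and error handling is trimmed to the essentials.

    #include <linux/vmalloc.h>
    #include <linux/mm.h>
    #include <linux/fs.h>
    #include <linux/errno.h>

    #define MYDEV_BUF_SIZE (256 * PAGE_SIZE)     /* hypothetical buffer size */

    static void *mydev_buf;                      /* must come from vmalloc_user() */

    static int mydev_alloc_buffer(void)
    {
            mydev_buf = vmalloc_user(MYDEV_BUF_SIZE);   /* zeroed, VM_USERMAP set */
            return mydev_buf ? 0 : -ENOMEM;
    }

    static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
    {
            /* remap_vmalloc_range() rejects areas without VM_USERMAP and vmas
             * that do not fit inside the buffer, so those checks come for free. */
            return remap_vmalloc_range(vma, mydev_buf, vma->vm_pgoff);
    }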
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 440a733fe2e9..ff2ebe9458a3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -34,6 +34,7 @@
34#include <linux/notifier.h> 34#include <linux/notifier.h>
35#include <linux/rwsem.h> 35#include <linux/rwsem.h>
36#include <linux/delay.h> 36#include <linux/delay.h>
37#include <linux/kthread.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
39#include <asm/div64.h> 40#include <asm/div64.h>
@@ -46,8 +47,6 @@ struct scan_control {
46 /* Incremented by the number of inactive pages that were scanned */ 47 /* Incremented by the number of inactive pages that were scanned */
47 unsigned long nr_scanned; 48 unsigned long nr_scanned;
48 49
49 unsigned long nr_mapped; /* From page_state */
50
51 /* This context's GFP mask */ 50 /* This context's GFP mask */
52 gfp_t gfp_mask; 51 gfp_t gfp_mask;
53 52
@@ -61,6 +60,8 @@ struct scan_control {
61 * In this context, it doesn't matter that we scan the 60 * In this context, it doesn't matter that we scan the
62 * whole list at once. */ 61 * whole list at once. */
63 int swap_cluster_max; 62 int swap_cluster_max;
63
64 int swappiness;
64}; 65};
65 66
66/* 67/*
@@ -108,7 +109,7 @@ struct shrinker {
108 * From 0 .. 100. Higher means more swappy. 109 * From 0 .. 100. Higher means more swappy.
109 */ 110 */
110int vm_swappiness = 60; 111int vm_swappiness = 60;
111static long total_memory; 112long vm_total_pages; /* The total number of pages which the VM controls */
112 113
113static LIST_HEAD(shrinker_list); 114static LIST_HEAD(shrinker_list);
114static DECLARE_RWSEM(shrinker_rwsem); 115static DECLARE_RWSEM(shrinker_rwsem);
@@ -214,7 +215,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
214 break; 215 break;
215 if (shrink_ret < nr_before) 216 if (shrink_ret < nr_before)
216 ret += nr_before - shrink_ret; 217 ret += nr_before - shrink_ret;
217 mod_page_state(slabs_scanned, this_scan); 218 count_vm_events(SLABS_SCANNED, this_scan);
218 total_scan -= this_scan; 219 total_scan -= this_scan;
219 220
220 cond_resched(); 221 cond_resched();
@@ -288,11 +289,23 @@ static void handle_write_error(struct address_space *mapping,
288 unlock_page(page); 289 unlock_page(page);
289} 290}
290 291
292/* possible outcome of pageout() */
293typedef enum {
294 /* failed to write page out, page is locked */
295 PAGE_KEEP,
296 /* move page to the active list, page is locked */
297 PAGE_ACTIVATE,
298 /* page has been sent to the disk successfully, page is unlocked */
299 PAGE_SUCCESS,
300 /* page is clean and locked */
301 PAGE_CLEAN,
302} pageout_t;
303
291/* 304/*
292 * pageout is called by shrink_page_list() for each dirty page. 305 * pageout is called by shrink_page_list() for each dirty page.
293 * Calls ->writepage(). 306 * Calls ->writepage().
294 */ 307 */
295pageout_t pageout(struct page *page, struct address_space *mapping) 308static pageout_t pageout(struct page *page, struct address_space *mapping)
296{ 309{
297 /* 310 /*
298 * If the page is dirty, only perform writeback if that write 311 * If the page is dirty, only perform writeback if that write
@@ -337,6 +350,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping)
337 struct writeback_control wbc = { 350 struct writeback_control wbc = {
338 .sync_mode = WB_SYNC_NONE, 351 .sync_mode = WB_SYNC_NONE,
339 .nr_to_write = SWAP_CLUSTER_MAX, 352 .nr_to_write = SWAP_CLUSTER_MAX,
353 .range_start = 0,
354 .range_end = LLONG_MAX,
340 .nonblocking = 1, 355 .nonblocking = 1,
341 .for_reclaim = 1, 356 .for_reclaim = 1,
342 }; 357 };
@@ -554,7 +569,7 @@ keep:
554 list_splice(&ret_pages, page_list); 569 list_splice(&ret_pages, page_list);
555 if (pagevec_count(&freed_pvec)) 570 if (pagevec_count(&freed_pvec))
556 __pagevec_release_nonlru(&freed_pvec); 571 __pagevec_release_nonlru(&freed_pvec);
557 mod_page_state(pgactivate, pgactivate); 572 count_vm_events(PGACTIVATE, pgactivate);
558 return nr_reclaimed; 573 return nr_reclaimed;
559} 574}
560 575
@@ -644,11 +659,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
644 nr_reclaimed += nr_freed; 659 nr_reclaimed += nr_freed;
645 local_irq_disable(); 660 local_irq_disable();
646 if (current_is_kswapd()) { 661 if (current_is_kswapd()) {
647 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); 662 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
648 __mod_page_state(kswapd_steal, nr_freed); 663 __count_vm_events(KSWAPD_STEAL, nr_freed);
649 } else 664 } else
650 __mod_page_state_zone(zone, pgscan_direct, nr_scan); 665 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
651 __mod_page_state_zone(zone, pgsteal, nr_freed); 666 __count_vm_events(PGACTIVATE, nr_freed);
652 667
653 if (nr_taken == 0) 668 if (nr_taken == 0)
654 goto done; 669 goto done;
@@ -727,7 +742,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
727 * how much memory 742 * how much memory
728 * is mapped. 743 * is mapped.
729 */ 744 */
730 mapped_ratio = (sc->nr_mapped * 100) / total_memory; 745 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
746 global_page_state(NR_ANON_PAGES)) * 100) /
747 vm_total_pages;
731 748
732 /* 749 /*
733 * Now decide how much we really want to unmap some pages. The 750 * Now decide how much we really want to unmap some pages. The
@@ -741,7 +758,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
741 * A 100% value of vm_swappiness overrides this algorithm 758 * A 100% value of vm_swappiness overrides this algorithm
742 * altogether. 759 * altogether.
743 */ 760 */
744 swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; 761 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
745 762
746 /* 763 /*
747 * Now use this metric to decide whether to start moving mapped 764 * Now use this metric to decide whether to start moving mapped
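
The two hunks above change how the reclaim heuristic is fed: mapped_ratio is now derived from the zoned NR_FILE_MAPPED and NR_ANON_PAGES counters against vm_total_pages, and the swappiness term comes from the per-scan sc->swappiness instead of the global knob. A small worked example of that arithmetic with made-up numbers (the surrounding code treats a swap_tendency of 100 or more as permission to reclaim mapped pages):

    #include <stdio.h>

    int main(void)
    {
            unsigned long nr_file_mapped = 40000;   /* hypothetical NR_FILE_MAPPED */
            unsigned long nr_anon_pages  = 120000;  /* hypothetical NR_ANON_PAGES  */
            unsigned long vm_total_pages = 1000000; /* pages the VM controls       */
            int distress   = 0;                     /* grows as priority drops     */
            int swappiness = 60;                    /* sc->swappiness default      */

            long mapped_ratio  = (nr_file_mapped + nr_anon_pages) * 100 / vm_total_pages;
            long swap_tendency = mapped_ratio / 2 + distress + swappiness;

            printf("mapped_ratio=%ld%% swap_tendency=%ld -> %s mapped pages\n",
                   mapped_ratio, swap_tendency,
                   swap_tendency >= 100 ? "reclaim" : "skip");
            return 0;
    }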
@@ -824,11 +841,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
824 } 841 }
825 } 842 }
826 zone->nr_active += pgmoved; 843 zone->nr_active += pgmoved;
827 spin_unlock(&zone->lru_lock);
828 844
829 __mod_page_state_zone(zone, pgrefill, pgscanned); 845 __count_zone_vm_events(PGREFILL, zone, pgscanned);
830 __mod_page_state(pgdeactivate, pgdeactivate); 846 __count_vm_events(PGDEACTIVATE, pgdeactivate);
831 local_irq_enable(); 847 spin_unlock_irq(&zone->lru_lock);
832 848
833 pagevec_release(&pvec); 849 pagevec_release(&pvec);
834} 850}
@@ -957,9 +973,10 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
957 .may_writepage = !laptop_mode, 973 .may_writepage = !laptop_mode,
958 .swap_cluster_max = SWAP_CLUSTER_MAX, 974 .swap_cluster_max = SWAP_CLUSTER_MAX,
959 .may_swap = 1, 975 .may_swap = 1,
976 .swappiness = vm_swappiness,
960 }; 977 };
961 978
962 inc_page_state(allocstall); 979 count_vm_event(ALLOCSTALL);
963 980
964 for (i = 0; zones[i] != NULL; i++) { 981 for (i = 0; zones[i] != NULL; i++) {
965 struct zone *zone = zones[i]; 982 struct zone *zone = zones[i];
@@ -972,7 +989,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
972 } 989 }
973 990
974 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 991 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
975 sc.nr_mapped = read_page_state(nr_mapped);
976 sc.nr_scanned = 0; 992 sc.nr_scanned = 0;
977 if (!priority) 993 if (!priority)
978 disable_swap_token(); 994 disable_swap_token();
@@ -1021,10 +1037,6 @@ out:
1021 * For kswapd, balance_pgdat() will work across all this node's zones until 1037 * For kswapd, balance_pgdat() will work across all this node's zones until
1022 * they are all at pages_high. 1038 * they are all at pages_high.
1023 * 1039 *
1024 * If `nr_pages' is non-zero then it is the number of pages which are to be
1025 * reclaimed, regardless of the zone occupancies. This is a software suspend
1026 * special.
1027 *
1028 * Returns the number of pages which were actually freed. 1040 * Returns the number of pages which were actually freed.
1029 * 1041 *
1030 * There is special handling here for zones which are full of pinned pages. 1042 * There is special handling here for zones which are full of pinned pages.
@@ -1042,10 +1054,8 @@ out:
1042 * the page allocator fallback scheme to ensure that aging of pages is balanced 1054 * the page allocator fallback scheme to ensure that aging of pages is balanced
1043 * across the zones. 1055 * across the zones.
1044 */ 1056 */
1045static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, 1057static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1046 int order)
1047{ 1058{
1048 unsigned long to_free = nr_pages;
1049 int all_zones_ok; 1059 int all_zones_ok;
1050 int priority; 1060 int priority;
1051 int i; 1061 int i;
@@ -1055,16 +1065,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
1055 struct scan_control sc = { 1065 struct scan_control sc = {
1056 .gfp_mask = GFP_KERNEL, 1066 .gfp_mask = GFP_KERNEL,
1057 .may_swap = 1, 1067 .may_swap = 1,
1058 .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, 1068 .swap_cluster_max = SWAP_CLUSTER_MAX,
1069 .swappiness = vm_swappiness,
1059 }; 1070 };
1060 1071
1061loop_again: 1072loop_again:
1062 total_scanned = 0; 1073 total_scanned = 0;
1063 nr_reclaimed = 0; 1074 nr_reclaimed = 0;
1064 sc.may_writepage = !laptop_mode; 1075 sc.may_writepage = !laptop_mode;
1065 sc.nr_mapped = read_page_state(nr_mapped); 1076 count_vm_event(PAGEOUTRUN);
1066
1067 inc_page_state(pageoutrun);
1068 1077
1069 for (i = 0; i < pgdat->nr_zones; i++) { 1078 for (i = 0; i < pgdat->nr_zones; i++) {
1070 struct zone *zone = pgdat->node_zones + i; 1079 struct zone *zone = pgdat->node_zones + i;
@@ -1082,31 +1091,26 @@ loop_again:
1082 1091
1083 all_zones_ok = 1; 1092 all_zones_ok = 1;
1084 1093
1085 if (nr_pages == 0) { 1094 /*
1086 /* 1095 * Scan in the highmem->dma direction for the highest
1087 * Scan in the highmem->dma direction for the highest 1096 * zone which needs scanning
1088 * zone which needs scanning 1097 */
1089 */ 1098 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1090 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1099 struct zone *zone = pgdat->node_zones + i;
1091 struct zone *zone = pgdat->node_zones + i;
1092 1100
1093 if (!populated_zone(zone)) 1101 if (!populated_zone(zone))
1094 continue; 1102 continue;
1095 1103
1096 if (zone->all_unreclaimable && 1104 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1097 priority != DEF_PRIORITY) 1105 continue;
1098 continue;
1099 1106
1100 if (!zone_watermark_ok(zone, order, 1107 if (!zone_watermark_ok(zone, order, zone->pages_high,
1101 zone->pages_high, 0, 0)) { 1108 0, 0)) {
1102 end_zone = i; 1109 end_zone = i;
1103 goto scan; 1110 goto scan;
1104 }
1105 } 1111 }
1106 goto out;
1107 } else {
1108 end_zone = pgdat->nr_zones - 1;
1109 } 1112 }
1113 goto out;
1110scan: 1114scan:
1111 for (i = 0; i <= end_zone; i++) { 1115 for (i = 0; i <= end_zone; i++) {
1112 struct zone *zone = pgdat->node_zones + i; 1116 struct zone *zone = pgdat->node_zones + i;
@@ -1133,11 +1137,9 @@ scan:
1133 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1137 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1134 continue; 1138 continue;
1135 1139
1136 if (nr_pages == 0) { /* Not software suspend */ 1140 if (!zone_watermark_ok(zone, order, zone->pages_high,
1137 if (!zone_watermark_ok(zone, order, 1141 end_zone, 0))
1138 zone->pages_high, end_zone, 0)) 1142 all_zones_ok = 0;
1139 all_zones_ok = 0;
1140 }
1141 zone->temp_priority = priority; 1143 zone->temp_priority = priority;
1142 if (zone->prev_priority > priority) 1144 if (zone->prev_priority > priority)
1143 zone->prev_priority = priority; 1145 zone->prev_priority = priority;
@@ -1162,8 +1164,6 @@ scan:
1162 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1164 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1163 sc.may_writepage = 1; 1165 sc.may_writepage = 1;
1164 } 1166 }
1165 if (nr_pages && to_free > nr_reclaimed)
1166 continue; /* swsusp: need to do more work */
1167 if (all_zones_ok) 1167 if (all_zones_ok)
1168 break; /* kswapd: all done */ 1168 break; /* kswapd: all done */
1169 /* 1169 /*
@@ -1179,7 +1179,7 @@ scan:
1179 * matches the direct reclaim path behaviour in terms of impact 1179 * matches the direct reclaim path behaviour in terms of impact
1180 * on zone->*_priority. 1180 * on zone->*_priority.
1181 */ 1181 */
1182 if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) 1182 if (nr_reclaimed >= SWAP_CLUSTER_MAX)
1183 break; 1183 break;
1184 } 1184 }
1185out: 1185out:
@@ -1220,7 +1220,6 @@ static int kswapd(void *p)
1220 }; 1220 };
1221 cpumask_t cpumask; 1221 cpumask_t cpumask;
1222 1222
1223 daemonize("kswapd%d", pgdat->node_id);
1224 cpumask = node_to_cpumask(pgdat->node_id); 1223 cpumask = node_to_cpumask(pgdat->node_id);
1225 if (!cpus_empty(cpumask)) 1224 if (!cpus_empty(cpumask))
1226 set_cpus_allowed(tsk, cpumask); 1225 set_cpus_allowed(tsk, cpumask);
@@ -1261,7 +1260,7 @@ static int kswapd(void *p)
1261 } 1260 }
1262 finish_wait(&pgdat->kswapd_wait, &wait); 1261 finish_wait(&pgdat->kswapd_wait, &wait);
1263 1262
1264 balance_pgdat(pgdat, 0, order); 1263 balance_pgdat(pgdat, order);
1265 } 1264 }
1266 return 0; 1265 return 0;
1267} 1266}
@@ -1290,35 +1289,152 @@ void wakeup_kswapd(struct zone *zone, int order)
1290 1289
1291#ifdef CONFIG_PM 1290#ifdef CONFIG_PM
1292/* 1291/*
1293 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed 1292 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
1294 * pages. 1293 * from LRU lists system-wide, for given pass and priority, and returns the
1294 * number of reclaimed pages
1295 *
1296 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
1297 */
1298static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
1299 int prio, struct scan_control *sc)
1300{
1301 struct zone *zone;
1302 unsigned long nr_to_scan, ret = 0;
1303
1304 for_each_zone(zone) {
1305
1306 if (!populated_zone(zone))
1307 continue;
1308
1309 if (zone->all_unreclaimable && prio != DEF_PRIORITY)
1310 continue;
1311
1312 /* For pass = 0 we don't shrink the active list */
1313 if (pass > 0) {
1314 zone->nr_scan_active += (zone->nr_active >> prio) + 1;
1315 if (zone->nr_scan_active >= nr_pages || pass > 3) {
1316 zone->nr_scan_active = 0;
1317 nr_to_scan = min(nr_pages, zone->nr_active);
1318 shrink_active_list(nr_to_scan, zone, sc);
1319 }
1320 }
1321
1322 zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
1323 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
1324 zone->nr_scan_inactive = 0;
1325 nr_to_scan = min(nr_pages, zone->nr_inactive);
1326 ret += shrink_inactive_list(nr_to_scan, zone, sc);
1327 if (ret >= nr_pages)
1328 return ret;
1329 }
1330 }
1331
1332 return ret;
1333}
1334
1335/*
1336 * Try to free `nr_pages' of memory, system-wide, and return the number of
1337 * freed pages.
1338 *
1339 * Rather than trying to age LRUs the aim is to preserve the overall
1340 * LRU order by reclaiming preferentially
1341 * inactive > active > active referenced > active mapped
1295 */ 1342 */
1296unsigned long shrink_all_memory(unsigned long nr_pages) 1343unsigned long shrink_all_memory(unsigned long nr_pages)
1297{ 1344{
1298 pg_data_t *pgdat; 1345 unsigned long lru_pages, nr_slab;
1299 unsigned long nr_to_free = nr_pages;
1300 unsigned long ret = 0; 1346 unsigned long ret = 0;
1301 unsigned retry = 2; 1347 int pass;
1302 struct reclaim_state reclaim_state = { 1348 struct reclaim_state reclaim_state;
1303 .reclaimed_slab = 0, 1349 struct zone *zone;
1350 struct scan_control sc = {
1351 .gfp_mask = GFP_KERNEL,
1352 .may_swap = 0,
1353 .swap_cluster_max = nr_pages,
1354 .may_writepage = 1,
1355 .swappiness = vm_swappiness,
1304 }; 1356 };
1305 1357
1306 current->reclaim_state = &reclaim_state; 1358 current->reclaim_state = &reclaim_state;
1307repeat: 1359
1308 for_each_online_pgdat(pgdat) { 1360 lru_pages = 0;
1309 unsigned long freed; 1361 for_each_zone(zone)
1310 1362 lru_pages += zone->nr_active + zone->nr_inactive;
1311 freed = balance_pgdat(pgdat, nr_to_free, 0); 1363
1312 ret += freed; 1364 nr_slab = global_page_state(NR_SLAB);
1313 nr_to_free -= freed; 1365 /* If slab caches are huge, it's better to hit them first */
1314 if ((long)nr_to_free <= 0) 1366 while (nr_slab >= lru_pages) {
1367 reclaim_state.reclaimed_slab = 0;
1368 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
1369 if (!reclaim_state.reclaimed_slab)
1315 break; 1370 break;
1371
1372 ret += reclaim_state.reclaimed_slab;
1373 if (ret >= nr_pages)
1374 goto out;
1375
1376 nr_slab -= reclaim_state.reclaimed_slab;
1316 } 1377 }
1317 if (retry-- && ret < nr_pages) { 1378
1318 blk_congestion_wait(WRITE, HZ/5); 1379 /*
1319 goto repeat; 1380 * We try to shrink LRUs in 5 passes:
1381 * 0 = Reclaim from inactive_list only
1382 * 1 = Reclaim from active list but don't reclaim mapped
1383 * 2 = 2nd pass of type 1
1384 * 3 = Reclaim mapped (normal reclaim)
1385 * 4 = 2nd pass of type 3
1386 */
1387 for (pass = 0; pass < 5; pass++) {
1388 int prio;
1389
1390 /* Needed for shrinking slab caches later on */
1391 if (!lru_pages)
1392 for_each_zone(zone) {
1393 lru_pages += zone->nr_active;
1394 lru_pages += zone->nr_inactive;
1395 }
1396
1397 /* Force reclaiming mapped pages in the passes #3 and #4 */
1398 if (pass > 2) {
1399 sc.may_swap = 1;
1400 sc.swappiness = 100;
1401 }
1402
1403 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
1404 unsigned long nr_to_scan = nr_pages - ret;
1405
1406 sc.nr_scanned = 0;
1407 ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
1408 if (ret >= nr_pages)
1409 goto out;
1410
1411 reclaim_state.reclaimed_slab = 0;
1412 shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
1413 ret += reclaim_state.reclaimed_slab;
1414 if (ret >= nr_pages)
1415 goto out;
1416
1417 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
1418 blk_congestion_wait(WRITE, HZ / 10);
1419 }
1420
1421 lru_pages = 0;
1320 } 1422 }
1423
1424 /*
1425 * If ret = 0, we could not shrink LRUs, but there may be something
1426 * in slab caches
1427 */
1428 if (!ret)
1429 do {
1430 reclaim_state.reclaimed_slab = 0;
1431 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
1432 ret += reclaim_state.reclaimed_slab;
1433 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1434
1435out:
1321 current->reclaim_state = NULL; 1436 current->reclaim_state = NULL;
1437
1322 return ret; 1438 return ret;
1323} 1439}
1324#endif 1440#endif
@@ -1328,7 +1444,7 @@ repeat:
1328 not required for correctness. So if the last cpu in a node goes 1444 not required for correctness. So if the last cpu in a node goes
1329 away, we get changed to run anywhere: as the first one comes back, 1445 away, we get changed to run anywhere: as the first one comes back,
1330 restore their cpu bindings. */ 1446 restore their cpu bindings. */
1331static int cpu_callback(struct notifier_block *nfb, 1447static int __devinit cpu_callback(struct notifier_block *nfb,
1332 unsigned long action, void *hcpu) 1448 unsigned long action, void *hcpu)
1333{ 1449{
1334 pg_data_t *pgdat; 1450 pg_data_t *pgdat;
@@ -1346,21 +1462,35 @@ static int cpu_callback(struct notifier_block *nfb,
1346} 1462}
1347#endif /* CONFIG_HOTPLUG_CPU */ 1463#endif /* CONFIG_HOTPLUG_CPU */
1348 1464
1465/*
1466 * This kswapd start function will be called by init and node-hot-add.
 1467 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
1468 */
1469int kswapd_run(int nid)
1470{
1471 pg_data_t *pgdat = NODE_DATA(nid);
1472 int ret = 0;
1473
1474 if (pgdat->kswapd)
1475 return 0;
1476
1477 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
1478 if (IS_ERR(pgdat->kswapd)) {
1479 /* failure at boot is fatal */
1480 BUG_ON(system_state == SYSTEM_BOOTING);
1481 printk("Failed to start kswapd on node %d\n",nid);
1482 ret = -1;
1483 }
1484 return ret;
1485}
1486
1349static int __init kswapd_init(void) 1487static int __init kswapd_init(void)
1350{ 1488{
1351 pg_data_t *pgdat; 1489 int nid;
1352 1490
1353 swap_setup(); 1491 swap_setup();
1354 for_each_online_pgdat(pgdat) { 1492 for_each_online_node(nid)
1355 pid_t pid; 1493 kswapd_run(nid);
1356
1357 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
1358 BUG_ON(pid < 0);
1359 read_lock(&tasklist_lock);
1360 pgdat->kswapd = find_task_by_pid(pid);
1361 read_unlock(&tasklist_lock);
1362 }
1363 total_memory = nr_free_pagecache_pages();
1364 hotcpu_notifier(cpu_callback, 0); 1494 hotcpu_notifier(cpu_callback, 0);
1365 return 0; 1495 return 0;
1366} 1496}
@@ -1387,11 +1517,6 @@ int zone_reclaim_mode __read_mostly;
1387#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ 1517#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
1388 1518
1389/* 1519/*
1390 * Mininum time between zone reclaim scans
1391 */
1392int zone_reclaim_interval __read_mostly = 30*HZ;
1393
1394/*
1395 * Priority for ZONE_RECLAIM. This determines the fraction of pages 1520 * Priority for ZONE_RECLAIM. This determines the fraction of pages
1396 * of a node considered for each zone_reclaim. 4 scans 1/16th of 1521 * of a node considered for each zone_reclaim. 4 scans 1/16th of
1397 * a zone. 1522 * a zone.
@@ -1412,10 +1537,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1412 struct scan_control sc = { 1537 struct scan_control sc = {
1413 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 1538 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
1414 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 1539 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
1415 .nr_mapped = read_page_state(nr_mapped),
1416 .swap_cluster_max = max_t(unsigned long, nr_pages, 1540 .swap_cluster_max = max_t(unsigned long, nr_pages,
1417 SWAP_CLUSTER_MAX), 1541 SWAP_CLUSTER_MAX),
1418 .gfp_mask = gfp_mask, 1542 .gfp_mask = gfp_mask,
1543 .swappiness = vm_swappiness,
1419 }; 1544 };
1420 1545
1421 disable_swap_token(); 1546 disable_swap_token();
@@ -1456,16 +1581,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1456 1581
1457 p->reclaim_state = NULL; 1582 p->reclaim_state = NULL;
1458 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 1583 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
1459
1460 if (nr_reclaimed == 0) {
1461 /*
1462 * We were unable to reclaim enough pages to stay on node. We
1463 * now allow off node accesses for a certain time period before
1464 * trying again to reclaim pages from the local zone.
1465 */
1466 zone->last_unsuccessful_zone_reclaim = jiffies;
1467 }
1468
1469 return nr_reclaimed >= nr_pages; 1584 return nr_reclaimed >= nr_pages;
1470} 1585}
1471 1586
@@ -1475,13 +1590,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1475 int node_id; 1590 int node_id;
1476 1591
1477 /* 1592 /*
1478 * Do not reclaim if there was a recent unsuccessful attempt at zone 1593 * Do not reclaim if there are not enough reclaimable pages in this
 1479 * reclaim. In that case we let allocations go off node for the 1594 * zone that would satisfy this allocation.
1480 * zone_reclaim_interval. Otherwise we would scan for each off-node 1595 *
1481 * page allocation. 1596 * All unmapped pagecache pages are reclaimable.
1597 *
1598 * Both counters may be temporarily off a bit so we use
1599 * SWAP_CLUSTER_MAX as the boundary. It may also be good to
1600 * leave a few frequently used unmapped pagecache pages around.
1482 */ 1601 */
1483 if (time_before(jiffies, 1602 if (zone_page_state(zone, NR_FILE_PAGES) -
1484 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) 1603 zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX)
1485 return 0; 1604 return 0;
1486 1605
1487 /* 1606 /*
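
The new zone_reclaim() gate above replaces the time-based back-off with a simple capacity check: local reclaim is only attempted when the zone holds at least SWAP_CLUSTER_MAX pages of unmapped pagecache (NR_FILE_PAGES minus NR_FILE_MAPPED). A worked example of that test with hypothetical counter values, using the kernel's usual SWAP_CLUSTER_MAX of 32:

    #include <stdio.h>

    #define SWAP_CLUSTER_MAX 32

    int main(void)
    {
            unsigned long nr_file_pages  = 5000;   /* NR_FILE_PAGES in this zone  */
            unsigned long nr_file_mapped = 4990;   /* NR_FILE_MAPPED in this zone */
            unsigned long unmapped = nr_file_pages - nr_file_mapped;

            if (unmapped < SWAP_CLUSTER_MAX)
                    printf("only %lu unmapped pagecache pages: skip zone_reclaim\n",
                           unmapped);
            else
                    printf("%lu unmapped pagecache pages: try zone_reclaim\n",
                           unmapped);
            return 0;
    }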
diff --git a/mm/vmstat.c b/mm/vmstat.c
new file mode 100644
index 000000000000..73b83d67bab6
--- /dev/null
+++ b/mm/vmstat.c
@@ -0,0 +1,614 @@
1/*
2 * linux/mm/vmstat.c
3 *
4 * Manages VM statistics
5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6 *
7 * zoned VM statistics
8 * Copyright (C) 2006 Silicon Graphics, Inc.,
9 * Christoph Lameter <christoph@lameter.com>
10 */
11
12#include <linux/config.h>
13#include <linux/mm.h>
14#include <linux/module.h>
15
16void __get_zone_counts(unsigned long *active, unsigned long *inactive,
17 unsigned long *free, struct pglist_data *pgdat)
18{
19 struct zone *zones = pgdat->node_zones;
20 int i;
21
22 *active = 0;
23 *inactive = 0;
24 *free = 0;
25 for (i = 0; i < MAX_NR_ZONES; i++) {
26 *active += zones[i].nr_active;
27 *inactive += zones[i].nr_inactive;
28 *free += zones[i].free_pages;
29 }
30}
31
32void get_zone_counts(unsigned long *active,
33 unsigned long *inactive, unsigned long *free)
34{
35 struct pglist_data *pgdat;
36
37 *active = 0;
38 *inactive = 0;
39 *free = 0;
40 for_each_online_pgdat(pgdat) {
41 unsigned long l, m, n;
42 __get_zone_counts(&l, &m, &n, pgdat);
43 *active += l;
44 *inactive += m;
45 *free += n;
46 }
47}
48
49#ifdef CONFIG_VM_EVENT_COUNTERS
50DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
51EXPORT_PER_CPU_SYMBOL(vm_event_states);
52
53static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
54{
55 int cpu = 0;
56 int i;
57
58 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
59
60 cpu = first_cpu(*cpumask);
61 while (cpu < NR_CPUS) {
62 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
63
64 cpu = next_cpu(cpu, *cpumask);
65
66 if (cpu < NR_CPUS)
67 prefetch(&per_cpu(vm_event_states, cpu));
68
69
70 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
71 ret[i] += this->event[i];
72 }
73}
74
75/*
76 * Accumulate the vm event counters across all CPUs.
77 * The result is unavoidably approximate - it can change
78 * during and after execution of this function.
79*/
80void all_vm_events(unsigned long *ret)
81{
82 sum_vm_events(ret, &cpu_online_map);
83}
84
85#ifdef CONFIG_HOTPLUG
86/*
87 * Fold the foreign cpu events into our own.
88 *
89 * This is adding to the events on one processor
90 * but keeps the global counts constant.
91 */
92void vm_events_fold_cpu(int cpu)
93{
94 struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
95 int i;
96
97 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
98 count_vm_events(i, fold_state->event[i]);
99 fold_state->event[i] = 0;
100 }
101}
102#endif /* CONFIG_HOTPLUG */
103
104#endif /* CONFIG_VM_EVENT_COUNTERS */
105
106/*
107 * Manage combined zone based / global counters
108 *
109 * vm_stat contains the global counters
110 */
111atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
112EXPORT_SYMBOL(vm_stat);
113
114#ifdef CONFIG_SMP
115
116#define STAT_THRESHOLD 32
117
118/*
119 * Determine pointer to currently valid differential byte given a zone and
120 * the item number.
121 *
122 * Preemption must be off
123 */
124static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
125{
126 return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item];
127}
128
129/*
130 * For use when we know that interrupts are disabled.
131 */
132void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
133 int delta)
134{
135 s8 *p;
136 long x;
137
138 p = diff_pointer(zone, item);
139 x = delta + *p;
140
141 if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) {
142 zone_page_state_add(x, zone, item);
143 x = 0;
144 }
145
146 *p = x;
147}
148EXPORT_SYMBOL(__mod_zone_page_state);
149
150/*
151 * For an unknown interrupt state
152 */
153void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
154 int delta)
155{
156 unsigned long flags;
157
158 local_irq_save(flags);
159 __mod_zone_page_state(zone, item, delta);
160 local_irq_restore(flags);
161}
162EXPORT_SYMBOL(mod_zone_page_state);
163
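
The point of the STAT_THRESHOLD batching above is a bounded accuracy/performance trade-off: each CPU may keep roughly STAT_THRESHOLD pages of unfolded delta per counter, so a global counter can lag its true value by at most about cpus * STAT_THRESHOLD pages. The numbers below (8 CPUs, 4 KiB pages) are purely illustrative:

    #include <stdio.h>

    int main(void)
    {
            int  stat_threshold = 32;      /* STAT_THRESHOLD above        */
            int  cpus           = 8;       /* hypothetical online CPUs    */
            long page_size      = 4096;    /* hypothetical PAGE_SIZE      */

            long max_drift_pages = (long)cpus * stat_threshold;

            printf("worst-case drift: %ld pages (%ld KiB) per counter\n",
                   max_drift_pages, max_drift_pages * page_size / 1024);
            return 0;
    }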
164/*
165 * Optimized increment and decrement functions.
166 *
167 * These are only for a single page and therefore can take a struct page *
168 * argument instead of struct zone *. This allows the inclusion of the code
169 * generated for page_zone(page) into the optimized functions.
170 *
171 * No overflow check is necessary and therefore the differential can be
172 * incremented or decremented in place which may allow the compilers to
173 * generate better code.
174 *
175 * The increment or decrement is known and therefore one boundary check can
176 * be omitted.
177 *
178 * Some processors have inc/dec instructions that are atomic vs an interrupt.
179 * However, the code must first determine the differential location in a zone
180 * based on the processor number and then inc/dec the counter. There is no
181 * guarantee without disabling preemption that the processor will not change
182 * in between and therefore the atomicity vs. interrupt cannot be exploited
183 * in a useful way here.
184 */
185static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
186{
187 s8 *p = diff_pointer(zone, item);
188
189 (*p)++;
190
191 if (unlikely(*p > STAT_THRESHOLD)) {
192 zone_page_state_add(*p, zone, item);
193 *p = 0;
194 }
195}
196
197void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
198{
199 __inc_zone_state(page_zone(page), item);
200}
201EXPORT_SYMBOL(__inc_zone_page_state);
202
203void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
204{
205 struct zone *zone = page_zone(page);
206 s8 *p = diff_pointer(zone, item);
207
208 (*p)--;
209
210 if (unlikely(*p < -STAT_THRESHOLD)) {
211 zone_page_state_add(*p, zone, item);
212 *p = 0;
213 }
214}
215EXPORT_SYMBOL(__dec_zone_page_state);
216
217void inc_zone_state(struct zone *zone, enum zone_stat_item item)
218{
219 unsigned long flags;
220
221 local_irq_save(flags);
222 __inc_zone_state(zone, item);
223 local_irq_restore(flags);
224}
225
226void inc_zone_page_state(struct page *page, enum zone_stat_item item)
227{
228 unsigned long flags;
229 struct zone *zone;
230
231 zone = page_zone(page);
232 local_irq_save(flags);
233 __inc_zone_state(zone, item);
234 local_irq_restore(flags);
235}
236EXPORT_SYMBOL(inc_zone_page_state);
237
238void dec_zone_page_state(struct page *page, enum zone_stat_item item)
239{
240 unsigned long flags;
241 struct zone *zone;
242 s8 *p;
243
244 zone = page_zone(page);
245 local_irq_save(flags);
246 p = diff_pointer(zone, item);
247
248 (*p)--;
249
250 if (unlikely(*p < -STAT_THRESHOLD)) {
251 zone_page_state_add(*p, zone, item);
252 *p = 0;
253 }
254 local_irq_restore(flags);
255}
256EXPORT_SYMBOL(dec_zone_page_state);
257
258/*
259 * Update the zone counters for one cpu.
260 */
261void refresh_cpu_vm_stats(int cpu)
262{
263 struct zone *zone;
264 int i;
265 unsigned long flags;
266
267 for_each_zone(zone) {
268 struct per_cpu_pageset *pcp;
269
270 pcp = zone_pcp(zone, cpu);
271
272 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
273 if (pcp->vm_stat_diff[i]) {
274 local_irq_save(flags);
275 zone_page_state_add(pcp->vm_stat_diff[i],
276 zone, i);
277 pcp->vm_stat_diff[i] = 0;
278 local_irq_restore(flags);
279 }
280 }
281}
282
283static void __refresh_cpu_vm_stats(void *dummy)
284{
285 refresh_cpu_vm_stats(smp_processor_id());
286}
287
288/*
289 * Consolidate all counters.
290 *
291 * Note that the result is less inaccurate but still inaccurate
292 * if concurrent processes are allowed to run.
293 */
294void refresh_vm_stats(void)
295{
296 on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
297}
298EXPORT_SYMBOL(refresh_vm_stats);
299
300#endif
301
302#ifdef CONFIG_NUMA
303/*
304 * zonelist = the list of zones passed to the allocator
305 * z = the zone from which the allocation occurred.
306 *
307 * Must be called with interrupts disabled.
308 */
309void zone_statistics(struct zonelist *zonelist, struct zone *z)
310{
311 if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
312 __inc_zone_state(z, NUMA_HIT);
313 } else {
314 __inc_zone_state(z, NUMA_MISS);
315 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
316 }
317 if (z->zone_pgdat == NODE_DATA(numa_node_id()))
318 __inc_zone_state(z, NUMA_LOCAL);
319 else
320 __inc_zone_state(z, NUMA_OTHER);
321}
322#endif
323
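
zone_statistics() above classifies every NUMA allocation twice: hit or miss (plus a foreign event on the preferred node) depending on whether the preferred zone satisfied it, and local or other depending on whether it landed on the node the allocating CPU runs on. The standalone example below replays that classification with plain node ids in place of the pgdat pointers; it is purely illustrative.

    #include <stdio.h>

    static void classify(int preferred_node, int alloc_node, int cpu_node)
    {
            printf("preferred=%d alloc=%d cpu-node=%d -> %s, %s\n",
                   preferred_node, alloc_node, cpu_node,
                   alloc_node == preferred_node ?
                           "NUMA_HIT" : "NUMA_MISS (+NUMA_FOREIGN on preferred)",
                   alloc_node == cpu_node ? "NUMA_LOCAL" : "NUMA_OTHER");
    }

    int main(void)
    {
            classify(0, 0, 0);   /* satisfied from the preferred node, locally      */
            classify(0, 1, 0);   /* fell back to node 1: miss for 0, other for cpu  */
            classify(1, 1, 0);   /* preferred remote node: a hit, but not local     */
            return 0;
    }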
324#ifdef CONFIG_PROC_FS
325
326#include <linux/seq_file.h>
327
328static void *frag_start(struct seq_file *m, loff_t *pos)
329{
330 pg_data_t *pgdat;
331 loff_t node = *pos;
332 for (pgdat = first_online_pgdat();
333 pgdat && node;
334 pgdat = next_online_pgdat(pgdat))
335 --node;
336
337 return pgdat;
338}
339
340static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
341{
342 pg_data_t *pgdat = (pg_data_t *)arg;
343
344 (*pos)++;
345 return next_online_pgdat(pgdat);
346}
347
348static void frag_stop(struct seq_file *m, void *arg)
349{
350}
351
352/*
353 * This walks the free areas for each zone.
354 */
355static int frag_show(struct seq_file *m, void *arg)
356{
357 pg_data_t *pgdat = (pg_data_t *)arg;
358 struct zone *zone;
359 struct zone *node_zones = pgdat->node_zones;
360 unsigned long flags;
361 int order;
362
363 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
364 if (!populated_zone(zone))
365 continue;
366
367 spin_lock_irqsave(&zone->lock, flags);
368 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
369 for (order = 0; order < MAX_ORDER; ++order)
370 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
371 spin_unlock_irqrestore(&zone->lock, flags);
372 seq_putc(m, '\n');
373 }
374 return 0;
375}
376
377struct seq_operations fragmentation_op = {
378 .start = frag_start,
379 .next = frag_next,
380 .stop = frag_stop,
381 .show = frag_show,
382};
383
384static char *vmstat_text[] = {
385 /* Zoned VM counters */
386 "nr_anon_pages",
387 "nr_mapped",
388 "nr_file_pages",
389 "nr_slab",
390 "nr_page_table_pages",
391 "nr_dirty",
392 "nr_writeback",
393 "nr_unstable",
394 "nr_bounce",
395
396#ifdef CONFIG_NUMA
397 "numa_hit",
398 "numa_miss",
399 "numa_foreign",
400 "numa_interleave",
401 "numa_local",
402 "numa_other",
403#endif
404
405#ifdef CONFIG_VM_EVENT_COUNTERS
406 "pgpgin",
407 "pgpgout",
408 "pswpin",
409 "pswpout",
410
411 "pgalloc_dma",
412 "pgalloc_dma32",
413 "pgalloc_normal",
414 "pgalloc_high",
415
416 "pgfree",
417 "pgactivate",
418 "pgdeactivate",
419
420 "pgfault",
421 "pgmajfault",
422
423 "pgrefill_dma",
424 "pgrefill_dma32",
425 "pgrefill_normal",
426 "pgrefill_high",
427
428 "pgsteal_dma",
429 "pgsteal_dma32",
430 "pgsteal_normal",
431 "pgsteal_high",
432
433 "pgscan_kswapd_dma",
434 "pgscan_kswapd_dma32",
435 "pgscan_kswapd_normal",
436 "pgscan_kswapd_high",
437
438 "pgscan_direct_dma",
439 "pgscan_direct_dma32",
440 "pgscan_direct_normal",
441 "pgscan_direct_high",
442
443 "pginodesteal",
444 "slabs_scanned",
445 "kswapd_steal",
446 "kswapd_inodesteal",
447 "pageoutrun",
448 "allocstall",
449
450 "pgrotated",
451#endif
452};
453
454/*
455 * Output information about zones in @pgdat.
456 */
457static int zoneinfo_show(struct seq_file *m, void *arg)
458{
459 pg_data_t *pgdat = arg;
460 struct zone *zone;
461 struct zone *node_zones = pgdat->node_zones;
462 unsigned long flags;
463
464 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
465 int i;
466
467 if (!populated_zone(zone))
468 continue;
469
470 spin_lock_irqsave(&zone->lock, flags);
471 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
472 seq_printf(m,
473 "\n pages free %lu"
474 "\n min %lu"
475 "\n low %lu"
476 "\n high %lu"
477 "\n active %lu"
478 "\n inactive %lu"
479 "\n scanned %lu (a: %lu i: %lu)"
480 "\n spanned %lu"
481 "\n present %lu",
482 zone->free_pages,
483 zone->pages_min,
484 zone->pages_low,
485 zone->pages_high,
486 zone->nr_active,
487 zone->nr_inactive,
488 zone->pages_scanned,
489 zone->nr_scan_active, zone->nr_scan_inactive,
490 zone->spanned_pages,
491 zone->present_pages);
492
493 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
494 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
495 zone_page_state(zone, i));
496
497 seq_printf(m,
498 "\n protection: (%lu",
499 zone->lowmem_reserve[0]);
500 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
501 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
502 seq_printf(m,
503 ")"
504 "\n pagesets");
505 for_each_online_cpu(i) {
506 struct per_cpu_pageset *pageset;
507 int j;
508
509 pageset = zone_pcp(zone, i);
510 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
511 if (pageset->pcp[j].count)
512 break;
513 }
514 if (j == ARRAY_SIZE(pageset->pcp))
515 continue;
516 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
517 seq_printf(m,
518 "\n cpu: %i pcp: %i"
519 "\n count: %i"
520 "\n high: %i"
521 "\n batch: %i",
522 i, j,
523 pageset->pcp[j].count,
524 pageset->pcp[j].high,
525 pageset->pcp[j].batch);
526 }
527 }
528 seq_printf(m,
529 "\n all_unreclaimable: %u"
530 "\n prev_priority: %i"
531 "\n temp_priority: %i"
532 "\n start_pfn: %lu",
533 zone->all_unreclaimable,
534 zone->prev_priority,
535 zone->temp_priority,
536 zone->zone_start_pfn);
537 spin_unlock_irqrestore(&zone->lock, flags);
538 seq_putc(m, '\n');
539 }
540 return 0;
541}
542
543struct seq_operations zoneinfo_op = {
544 .start = frag_start, /* iterate over all zones. The same as in
545 * fragmentation. */
546 .next = frag_next,
547 .stop = frag_stop,
548 .show = zoneinfo_show,
549};
550
551static void *vmstat_start(struct seq_file *m, loff_t *pos)
552{
553 unsigned long *v;
554#ifdef CONFIG_VM_EVENT_COUNTERS
555 unsigned long *e;
556#endif
557 int i;
558
559 if (*pos >= ARRAY_SIZE(vmstat_text))
560 return NULL;
561
562#ifdef CONFIG_VM_EVENT_COUNTERS
563 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
564 + sizeof(struct vm_event_state), GFP_KERNEL);
565#else
566 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
567 GFP_KERNEL);
568#endif
569 m->private = v;
570 if (!v)
571 return ERR_PTR(-ENOMEM);
572 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
573 v[i] = global_page_state(i);
574#ifdef CONFIG_VM_EVENT_COUNTERS
575 e = v + NR_VM_ZONE_STAT_ITEMS;
576 all_vm_events(e);
577 e[PGPGIN] /= 2; /* sectors -> kbytes */
578 e[PGPGOUT] /= 2;
579#endif
580 return v + *pos;
581}
582
583static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
584{
585 (*pos)++;
586 if (*pos >= ARRAY_SIZE(vmstat_text))
587 return NULL;
588 return (unsigned long *)m->private + *pos;
589}
590
591static int vmstat_show(struct seq_file *m, void *arg)
592{
593 unsigned long *l = arg;
594 unsigned long off = l - (unsigned long *)m->private;
595
596 seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
597 return 0;
598}
599
600static void vmstat_stop(struct seq_file *m, void *arg)
601{
602 kfree(m->private);
603 m->private = NULL;
604}
605
606struct seq_operations vmstat_op = {
607 .start = vmstat_start,
608 .next = vmstat_next,
609 .stop = vmstat_stop,
610 .show = vmstat_show,
611};
612
613#endif /* CONFIG_PROC_FS */
614
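
vmstat_show() above emits one "name value" pair per line, with the zoned counters first and the event counters appended when CONFIG_VM_EVENT_COUNTERS is set, so the /proc/vmstat file this feeds can be consumed with a trivial reader. The standalone userspace example below prints a few counters whose names are taken from vmstat_text[] above; adjust the list to taste.

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            const char *wanted[] = {
                    "nr_file_pages", "nr_mapped", "pgactivate", "pgrotated"
            };
            char name[128];
            unsigned long long value;
            FILE *f = fopen("/proc/vmstat", "r");

            if (!f) {
                    perror("/proc/vmstat");
                    return 1;
            }
            /* Each line is "<name> <value>\n", as produced by vmstat_show(). */
            while (fscanf(f, "%127s %llu", name, &value) == 2) {
                    for (size_t i = 0; i < sizeof(wanted) / sizeof(wanted[0]); i++)
                            if (strcmp(name, wanted[i]) == 0)
                                    printf("%-16s %llu\n", name, value);
            }
            fclose(f);
            return 0;
    }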