path: root/mm
author     Trond Myklebust <Trond.Myklebust@netapp.com>  2006-06-24 08:41:41 -0400
committer  Trond Myklebust <Trond.Myklebust@netapp.com>  2006-06-24 13:07:53 -0400
commit     816724e65c72a90a44fbad0ef0b59b186c85fa90 (patch)
tree       421fa29aedff988e392f92780637553e275d37a0 /mm
parent     70ac4385a13f78bc478f26d317511893741b05bd (diff)
parent     d384ea691fe4ea8c2dd5b9b8d9042eb181776f18 (diff)
Merge branch 'master' of /home/trondmy/kernel/linux-2.6/
Conflicts:

	fs/nfs/inode.c
	fs/super.c

Fix conflicts between patch 'NFS: Split fs/nfs/inode.c' and patch
'VFS: Permit filesystem to override root dentry on mount'
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |    4
-rw-r--r--  mm/filemap.c        |  183
-rw-r--r--  mm/filemap.h        |    6
-rw-r--r--  mm/fremap.c         |    9
-rw-r--r--  mm/hugetlb.c        |  282
-rw-r--r--  mm/memory.c         |  125
-rw-r--r--  mm/memory_hotplug.c |   27
-rw-r--r--  mm/mempolicy.c      |   36
-rw-r--r--  mm/migrate.c        | 1058
-rw-r--r--  mm/mmap.c           |   12
-rw-r--r--  mm/mprotect.c       |   37
-rw-r--r--  mm/msync.c          |    3
-rw-r--r--  mm/oom_kill.c       |    9
-rw-r--r--  mm/page-writeback.c |    3
-rw-r--r--  mm/page_alloc.c     |  184
-rw-r--r--  mm/pdflush.c        |    3
-rw-r--r--  mm/rmap.c           |  107
-rw-r--r--  mm/shmem.c          |   18
-rw-r--r--  mm/slab.c           |  249
-rw-r--r--  mm/sparse.c         |   22
-rw-r--r--  mm/swap.c           |   42
-rw-r--r--  mm/swapfile.c       |   43
-rw-r--r--  mm/truncate.c       |   22
-rw-r--r--  mm/vmalloc.c        |  122
-rw-r--r--  mm/vmscan.c         |  240
25 files changed, 1816 insertions, 1030 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 332f5c29b53a..66e65ab39426 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS
138# 138#
139config MIGRATION 139config MIGRATION
140 bool "Page migration" 140 bool "Page migration"
141 def_bool y if NUMA 141 def_bool y
142 depends on SWAP && NUMA 142 depends on NUMA
143 help 143 help
144 Allows the migration of the physical location of pages of processes 144 Allows the migration of the physical location of pages of processes
145 while the virtual addresses are not changed. This is useful for 145 while the virtual addresses are not changed. This is useful for
diff --git a/mm/filemap.c b/mm/filemap.c
index fd57442186cb..807a463fd5ed 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -14,6 +14,7 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/uaccess.h>
17#include <linux/aio.h> 18#include <linux/aio.h>
18#include <linux/capability.h> 19#include <linux/capability.h>
19#include <linux/kernel_stat.h> 20#include <linux/kernel_stat.h>
@@ -38,7 +39,6 @@
38 */ 39 */
39#include <linux/buffer_head.h> /* for generic_osync_inode */ 40#include <linux/buffer_head.h> /* for generic_osync_inode */
40 41
41#include <asm/uaccess.h>
42#include <asm/mman.h> 42#include <asm/mman.h>
43 43
44static ssize_t 44static ssize_t
@@ -171,15 +171,17 @@ static int sync_page(void *word)
171} 171}
172 172
173/** 173/**
174 * filemap_fdatawrite_range - start writeback against all of a mapping's 174 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
175 * dirty pages that lie within the byte offsets <start, end>
176 * @mapping: address space structure to write 175 * @mapping: address space structure to write
177 * @start: offset in bytes where the range starts 176 * @start: offset in bytes where the range starts
178 * @end: offset in bytes where the range ends (inclusive) 177 * @end: offset in bytes where the range ends (inclusive)
179 * @sync_mode: enable synchronous operation 178 * @sync_mode: enable synchronous operation
180 * 179 *
180 * Start writeback against all of a mapping's dirty pages that lie
181 * within the byte offsets <start, end> inclusive.
182 *
181 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 183 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
182 * opposed to a regular memory * cleansing writeback. The difference between 184 * opposed to a regular memory cleansing writeback. The difference between
183 * these two operations is that if a dirty page/buffer is encountered, it must 185 * these two operations is that if a dirty page/buffer is encountered, it must
184 * be waited upon, and not just skipped over. 186 * be waited upon, and not just skipped over.
185 */ 187 */
@@ -190,8 +192,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
190 struct writeback_control wbc = { 192 struct writeback_control wbc = {
191 .sync_mode = sync_mode, 193 .sync_mode = sync_mode,
192 .nr_to_write = mapping->nrpages * 2, 194 .nr_to_write = mapping->nrpages * 2,
193 .start = start, 195 .range_start = start,
194 .end = end, 196 .range_end = end,
195 }; 197 };
196 198
197 if (!mapping_cap_writeback_dirty(mapping)) 199 if (!mapping_cap_writeback_dirty(mapping))
@@ -204,7 +206,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
204static inline int __filemap_fdatawrite(struct address_space *mapping, 206static inline int __filemap_fdatawrite(struct address_space *mapping,
205 int sync_mode) 207 int sync_mode)
206{ 208{
207 return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode); 209 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
208} 210}
209 211
210int filemap_fdatawrite(struct address_space *mapping) 212int filemap_fdatawrite(struct address_space *mapping)
@@ -219,7 +221,10 @@ static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
219 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 221 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
220} 222}
221 223
222/* 224/**
225 * filemap_flush - mostly a non-blocking flush
226 * @mapping: target address_space
227 *
223 * This is a mostly non-blocking flush. Not suitable for data-integrity 228 * This is a mostly non-blocking flush. Not suitable for data-integrity
224 * purposes - I/O may not be started against all dirty pages. 229 * purposes - I/O may not be started against all dirty pages.
225 */ 230 */
@@ -229,7 +234,12 @@ int filemap_flush(struct address_space *mapping)
229} 234}
230EXPORT_SYMBOL(filemap_flush); 235EXPORT_SYMBOL(filemap_flush);
231 236
232/* 237/**
238 * wait_on_page_writeback_range - wait for writeback to complete
239 * @mapping: target address_space
240 * @start: beginning page index
241 * @end: ending page index
242 *
233 * Wait for writeback to complete against pages indexed by start->end 243 * Wait for writeback to complete against pages indexed by start->end
234 * inclusive 244 * inclusive
235 */ 245 */
@@ -276,7 +286,13 @@ int wait_on_page_writeback_range(struct address_space *mapping,
276 return ret; 286 return ret;
277} 287}
278 288
279/* 289/**
290 * sync_page_range - write and wait on all pages in the passed range
291 * @inode: target inode
292 * @mapping: target address_space
293 * @pos: beginning offset in pages to write
294 * @count: number of bytes to write
295 *
280 * Write and wait upon all the pages in the passed range. This is a "data 296 * Write and wait upon all the pages in the passed range. This is a "data
281 * integrity" operation. It waits upon in-flight writeout before starting and 297 * integrity" operation. It waits upon in-flight writeout before starting and
282 * waiting upon new writeout. If there was an IO error, return it. 298 * waiting upon new writeout. If there was an IO error, return it.
@@ -305,7 +321,13 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
305} 321}
306EXPORT_SYMBOL(sync_page_range); 322EXPORT_SYMBOL(sync_page_range);
307 323
308/* 324/**
325 * sync_page_range_nolock
326 * @inode: target inode
327 * @mapping: target address_space
328 * @pos: beginning offset in pages to write
329 * @count: number of bytes to write
330 *
309 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea 331 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
310 * as it forces O_SYNC writers to different parts of the same file 332 * as it forces O_SYNC writers to different parts of the same file
311 * to be serialised right until io completion. 333 * to be serialised right until io completion.
@@ -329,10 +351,11 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
329EXPORT_SYMBOL(sync_page_range_nolock); 351EXPORT_SYMBOL(sync_page_range_nolock);
330 352
331/** 353/**
332 * filemap_fdatawait - walk the list of under-writeback pages of the given 354 * filemap_fdatawait - wait for all under-writeback pages to complete
333 * address space and wait for all of them.
334 *
335 * @mapping: address space structure to wait for 355 * @mapping: address space structure to wait for
356 *
357 * Walk the list of under-writeback pages of the given address space
358 * and wait for all of them.
336 */ 359 */
337int filemap_fdatawait(struct address_space *mapping) 360int filemap_fdatawait(struct address_space *mapping)
338{ 361{
@@ -368,7 +391,12 @@ int filemap_write_and_wait(struct address_space *mapping)
368} 391}
369EXPORT_SYMBOL(filemap_write_and_wait); 392EXPORT_SYMBOL(filemap_write_and_wait);
370 393
371/* 394/**
395 * filemap_write_and_wait_range - write out & wait on a file range
396 * @mapping: the address_space for the pages
397 * @lstart: offset in bytes where the range starts
398 * @lend: offset in bytes where the range ends (inclusive)
399 *
372 * Write out and wait upon file offsets lstart->lend, inclusive. 400 * Write out and wait upon file offsets lstart->lend, inclusive.
373 * 401 *
374 * Note that `lend' is inclusive (describes the last byte to be written) so 402 * Note that `lend' is inclusive (describes the last byte to be written) so
@@ -394,8 +422,14 @@ int filemap_write_and_wait_range(struct address_space *mapping,
394 return err; 422 return err;
395} 423}
396 424
397/* 425/**
398 * This function is used to add newly allocated pagecache pages: 426 * add_to_page_cache - add newly allocated pagecache pages
427 * @page: page to add
428 * @mapping: the page's address_space
429 * @offset: page index
430 * @gfp_mask: page allocation mode
431 *
432 * This function is used to add newly allocated pagecache pages;
399 * the page is new, so we can just run SetPageLocked() against it. 433 * the page is new, so we can just run SetPageLocked() against it.
400 * The other page state flags were set by rmqueue(). 434 * The other page state flags were set by rmqueue().
401 * 435 *
@@ -422,7 +456,6 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
422 } 456 }
423 return error; 457 return error;
424} 458}
425
426EXPORT_SYMBOL(add_to_page_cache); 459EXPORT_SYMBOL(add_to_page_cache);
427 460
428int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 461int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
@@ -489,8 +522,7 @@ void fastcall wait_on_page_bit(struct page *page, int bit_nr)
489EXPORT_SYMBOL(wait_on_page_bit); 522EXPORT_SYMBOL(wait_on_page_bit);
490 523
491/** 524/**
492 * unlock_page() - unlock a locked page 525 * unlock_page - unlock a locked page
493 *
494 * @page: the page 526 * @page: the page
495 * 527 *
496 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). 528 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
@@ -513,8 +545,9 @@ void fastcall unlock_page(struct page *page)
513} 545}
514EXPORT_SYMBOL(unlock_page); 546EXPORT_SYMBOL(unlock_page);
515 547
516/* 548/**
517 * End writeback against a page. 549 * end_page_writeback - end writeback against a page
550 * @page: the page
518 */ 551 */
519void end_page_writeback(struct page *page) 552void end_page_writeback(struct page *page)
520{ 553{
@@ -527,10 +560,11 @@ void end_page_writeback(struct page *page)
527} 560}
528EXPORT_SYMBOL(end_page_writeback); 561EXPORT_SYMBOL(end_page_writeback);
529 562
530/* 563/**
531 * Get a lock on the page, assuming we need to sleep to get it. 564 * __lock_page - get a lock on the page, assuming we need to sleep to get it
565 * @page: the page to lock
532 * 566 *
533 * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some 567 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
534 * random driver's requestfn sets TASK_RUNNING, we could busywait. However 568 * random driver's requestfn sets TASK_RUNNING, we could busywait. However
535 * chances are that on the second loop, the block layer's plug list is empty, 569 * chances are that on the second loop, the block layer's plug list is empty,
536 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. 570 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
@@ -544,8 +578,12 @@ void fastcall __lock_page(struct page *page)
544} 578}
545EXPORT_SYMBOL(__lock_page); 579EXPORT_SYMBOL(__lock_page);
546 580
547/* 581/**
548 * a rather lightweight function, finding and getting a reference to a 582 * find_get_page - find and get a page reference
583 * @mapping: the address_space to search
584 * @offset: the page index
585 *
586 * A rather lightweight function, finding and getting a reference to a
549 * hashed page atomically. 587 * hashed page atomically.
550 */ 588 */
551struct page * find_get_page(struct address_space *mapping, unsigned long offset) 589struct page * find_get_page(struct address_space *mapping, unsigned long offset)
@@ -559,11 +597,14 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset)
559 read_unlock_irq(&mapping->tree_lock); 597 read_unlock_irq(&mapping->tree_lock);
560 return page; 598 return page;
561} 599}
562
563EXPORT_SYMBOL(find_get_page); 600EXPORT_SYMBOL(find_get_page);
564 601
565/* 602/**
566 * Same as above, but trylock it instead of incrementing the count. 603 * find_trylock_page - find and lock a page
604 * @mapping: the address_space to search
605 * @offset: the page index
606 *
607 * Same as find_get_page(), but trylock it instead of incrementing the count.
567 */ 608 */
568struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) 609struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
569{ 610{
@@ -576,12 +617,10 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs
576 read_unlock_irq(&mapping->tree_lock); 617 read_unlock_irq(&mapping->tree_lock);
577 return page; 618 return page;
578} 619}
579
580EXPORT_SYMBOL(find_trylock_page); 620EXPORT_SYMBOL(find_trylock_page);
581 621
582/** 622/**
583 * find_lock_page - locate, pin and lock a pagecache page 623 * find_lock_page - locate, pin and lock a pagecache page
584 *
585 * @mapping: the address_space to search 624 * @mapping: the address_space to search
586 * @offset: the page index 625 * @offset: the page index
587 * 626 *
@@ -617,12 +656,10 @@ repeat:
617 read_unlock_irq(&mapping->tree_lock); 656 read_unlock_irq(&mapping->tree_lock);
618 return page; 657 return page;
619} 658}
620
621EXPORT_SYMBOL(find_lock_page); 659EXPORT_SYMBOL(find_lock_page);
622 660
623/** 661/**
624 * find_or_create_page - locate or add a pagecache page 662 * find_or_create_page - locate or add a pagecache page
625 *
626 * @mapping: the page's address_space 663 * @mapping: the page's address_space
627 * @index: the page's index into the mapping 664 * @index: the page's index into the mapping
628 * @gfp_mask: page allocation mode 665 * @gfp_mask: page allocation mode
@@ -663,7 +700,6 @@ repeat:
663 page_cache_release(cached_page); 700 page_cache_release(cached_page);
664 return page; 701 return page;
665} 702}
666
667EXPORT_SYMBOL(find_or_create_page); 703EXPORT_SYMBOL(find_or_create_page);
668 704
669/** 705/**
@@ -729,9 +765,16 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
729 return i; 765 return i;
730} 766}
731 767
732/* 768/**
769 * find_get_pages_tag - find and return pages that match @tag
770 * @mapping: the address_space to search
771 * @index: the starting page index
772 * @tag: the tag index
773 * @nr_pages: the maximum number of pages
774 * @pages: where the resulting pages are placed
775 *
733 * Like find_get_pages, except we only return pages which are tagged with 776 * Like find_get_pages, except we only return pages which are tagged with
734 * `tag'. We update *index to index the next page for the traversal. 777 * @tag. We update @index to index the next page for the traversal.
735 */ 778 */
736unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 779unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
737 int tag, unsigned int nr_pages, struct page **pages) 780 int tag, unsigned int nr_pages, struct page **pages)
@@ -750,7 +793,11 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
750 return ret; 793 return ret;
751} 794}
752 795
753/* 796/**
797 * grab_cache_page_nowait - returns locked page at given index in given cache
798 * @mapping: target address_space
799 * @index: the page index
800 *
754 * Same as grab_cache_page, but do not wait if the page is unavailable. 801 * Same as grab_cache_page, but do not wait if the page is unavailable.
755 * This is intended for speculative data generators, where the data can 802 * This is intended for speculative data generators, where the data can
756 * be regenerated if the page couldn't be grabbed. This routine should 803 * be regenerated if the page couldn't be grabbed. This routine should
@@ -779,19 +826,25 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
779 } 826 }
780 return page; 827 return page;
781} 828}
782
783EXPORT_SYMBOL(grab_cache_page_nowait); 829EXPORT_SYMBOL(grab_cache_page_nowait);
784 830
785/* 831/**
832 * do_generic_mapping_read - generic file read routine
833 * @mapping: address_space to be read
834 * @_ra: file's readahead state
835 * @filp: the file to read
836 * @ppos: current file position
837 * @desc: read_descriptor
838 * @actor: read method
839 *
786 * This is a generic file read routine, and uses the 840 * This is a generic file read routine, and uses the
787 * mapping->a_ops->readpage() function for the actual low-level 841 * mapping->a_ops->readpage() function for the actual low-level stuff.
788 * stuff.
789 * 842 *
790 * This is really ugly. But the goto's actually try to clarify some 843 * This is really ugly. But the goto's actually try to clarify some
791 * of the logic when it comes to error handling etc. 844 * of the logic when it comes to error handling etc.
792 * 845 *
793 * Note the struct file* is only passed for the use of readpage. It may be 846 * Note the struct file* is only passed for the use of readpage.
794 * NULL. 847 * It may be NULL.
795 */ 848 */
796void do_generic_mapping_read(struct address_space *mapping, 849void do_generic_mapping_read(struct address_space *mapping,
797 struct file_ra_state *_ra, 850 struct file_ra_state *_ra,
@@ -1004,7 +1057,6 @@ out:
1004 if (filp) 1057 if (filp)
1005 file_accessed(filp); 1058 file_accessed(filp);
1006} 1059}
1007
1008EXPORT_SYMBOL(do_generic_mapping_read); 1060EXPORT_SYMBOL(do_generic_mapping_read);
1009 1061
1010int file_read_actor(read_descriptor_t *desc, struct page *page, 1062int file_read_actor(read_descriptor_t *desc, struct page *page,
@@ -1045,7 +1097,13 @@ success:
1045 return size; 1097 return size;
1046} 1098}
1047 1099
1048/* 1100/**
1101 * __generic_file_aio_read - generic filesystem read routine
1102 * @iocb: kernel I/O control block
1103 * @iov: io vector request
1104 * @nr_segs: number of segments in the iovec
1105 * @ppos: current file position
1106 *
1049 * This is the "read()" routine for all filesystems 1107 * This is the "read()" routine for all filesystems
1050 * that can use the page cache directly. 1108 * that can use the page cache directly.
1051 */ 1109 */
@@ -1124,7 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1124out: 1182out:
1125 return retval; 1183 return retval;
1126} 1184}
1127
1128EXPORT_SYMBOL(__generic_file_aio_read); 1185EXPORT_SYMBOL(__generic_file_aio_read);
1129 1186
1130ssize_t 1187ssize_t
@@ -1135,7 +1192,6 @@ generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t
1135 BUG_ON(iocb->ki_pos != pos); 1192 BUG_ON(iocb->ki_pos != pos);
1136 return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); 1193 return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
1137} 1194}
1138
1139EXPORT_SYMBOL(generic_file_aio_read); 1195EXPORT_SYMBOL(generic_file_aio_read);
1140 1196
1141ssize_t 1197ssize_t
@@ -1151,7 +1207,6 @@ generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppo
1151 ret = wait_on_sync_kiocb(&kiocb); 1207 ret = wait_on_sync_kiocb(&kiocb);
1152 return ret; 1208 return ret;
1153} 1209}
1154
1155EXPORT_SYMBOL(generic_file_read); 1210EXPORT_SYMBOL(generic_file_read);
1156 1211
1157int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) 1212int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
@@ -1192,7 +1247,6 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
1192 return desc.written; 1247 return desc.written;
1193 return desc.error; 1248 return desc.error;
1194} 1249}
1195
1196EXPORT_SYMBOL(generic_file_sendfile); 1250EXPORT_SYMBOL(generic_file_sendfile);
1197 1251
1198static ssize_t 1252static ssize_t
@@ -1228,11 +1282,15 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1228} 1282}
1229 1283
1230#ifdef CONFIG_MMU 1284#ifdef CONFIG_MMU
1231/* 1285static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1286/**
1287 * page_cache_read - adds requested page to the page cache if not already there
1288 * @file: file to read
1289 * @offset: page index
1290 *
1232 * This adds the requested page to the page cache if it isn't already there, 1291 * This adds the requested page to the page cache if it isn't already there,
1233 * and schedules an I/O to read in its contents from disk. 1292 * and schedules an I/O to read in its contents from disk.
1234 */ 1293 */
1235static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1236static int fastcall page_cache_read(struct file * file, unsigned long offset) 1294static int fastcall page_cache_read(struct file * file, unsigned long offset)
1237{ 1295{
1238 struct address_space *mapping = file->f_mapping; 1296 struct address_space *mapping = file->f_mapping;
@@ -1259,7 +1317,12 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
1259 1317
1260#define MMAP_LOTSAMISS (100) 1318#define MMAP_LOTSAMISS (100)
1261 1319
1262/* 1320/**
1321 * filemap_nopage - read in file data for page fault handling
1322 * @area: the applicable vm_area
1323 * @address: target address to read in
1324 * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
1325 *
1263 * filemap_nopage() is invoked via the vma operations vector for a 1326 * filemap_nopage() is invoked via the vma operations vector for a
1264 * mapped memory region to read in file data during a page fault. 1327 * mapped memory region to read in file data during a page fault.
1265 * 1328 *
@@ -1462,7 +1525,6 @@ page_not_uptodate:
1462 page_cache_release(page); 1525 page_cache_release(page);
1463 return NULL; 1526 return NULL;
1464} 1527}
1465
1466EXPORT_SYMBOL(filemap_nopage); 1528EXPORT_SYMBOL(filemap_nopage);
1467 1529
1468static struct page * filemap_getpage(struct file *file, unsigned long pgoff, 1530static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
@@ -1716,7 +1778,13 @@ repeat:
1716 return page; 1778 return page;
1717} 1779}
1718 1780
1719/* 1781/**
1782 * read_cache_page - read into page cache, fill it if needed
1783 * @mapping: the page's address_space
1784 * @index: the page index
1785 * @filler: function to perform the read
1786 * @data: destination for read data
1787 *
1720 * Read into the page cache. If a page already exists, 1788 * Read into the page cache. If a page already exists,
1721 * and PageUptodate() is not set, try to fill the page. 1789 * and PageUptodate() is not set, try to fill the page.
1722 */ 1790 */
@@ -1754,7 +1822,6 @@ retry:
1754 out: 1822 out:
1755 return page; 1823 return page;
1756} 1824}
1757
1758EXPORT_SYMBOL(read_cache_page); 1825EXPORT_SYMBOL(read_cache_page);
1759 1826
1760/* 1827/*
@@ -1835,7 +1902,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
1835 int copy = min(bytes, iov->iov_len - base); 1902 int copy = min(bytes, iov->iov_len - base);
1836 1903
1837 base = 0; 1904 base = 0;
1838 left = __copy_from_user_inatomic(vaddr, buf, copy); 1905 left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
1839 copied += copy; 1906 copied += copy;
1840 bytes -= copy; 1907 bytes -= copy;
1841 vaddr += copy; 1908 vaddr += copy;
@@ -1854,7 +1921,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
1854/* 1921/*
1855 * Performs necessary checks before doing a write 1922 * Performs necessary checks before doing a write
1856 * 1923 *
1857 * Can adjust writing position aor amount of bytes to write. 1924 * Can adjust writing position or amount of bytes to write.
1858 * Returns appropriate error code that caller should return or 1925 * Returns appropriate error code that caller should return or
1859 * zero in case that write should be allowed. 1926 * zero in case that write should be allowed.
1860 */ 1927 */
diff --git a/mm/filemap.h b/mm/filemap.h
index 13793ba0ce17..5683cde22055 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -13,7 +13,7 @@
13#include <linux/highmem.h> 13#include <linux/highmem.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/config.h> 15#include <linux/config.h>
16#include <asm/uaccess.h> 16#include <linux/uaccess.h>
17 17
18size_t 18size_t
19__filemap_copy_from_user_iovec(char *vaddr, 19__filemap_copy_from_user_iovec(char *vaddr,
@@ -34,13 +34,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
34 int left; 34 int left;
35 35
36 kaddr = kmap_atomic(page, KM_USER0); 36 kaddr = kmap_atomic(page, KM_USER0);
37 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); 37 left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
38 kunmap_atomic(kaddr, KM_USER0); 38 kunmap_atomic(kaddr, KM_USER0);
39 39
40 if (left != 0) { 40 if (left != 0) {
41 /* Do it the slow way */ 41 /* Do it the slow way */
42 kaddr = kmap(page); 42 kaddr = kmap(page);
43 left = __copy_from_user(kaddr + offset, buf, bytes); 43 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
44 kunmap(page); 44 kunmap(page);
45 } 45 }
46 return bytes - left; 46 return bytes - left;
diff --git a/mm/fremap.c b/mm/fremap.c
index 9f381e58bf44..21b7d0cbc98c 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -83,6 +83,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
83 page_add_file_rmap(page); 83 page_add_file_rmap(page);
84 pte_val = *pte; 84 pte_val = *pte;
85 update_mmu_cache(vma, addr, pte_val); 85 update_mmu_cache(vma, addr, pte_val);
86 lazy_mmu_prot_update(pte_val);
86 err = 0; 87 err = 0;
87unlock: 88unlock:
88 pte_unmap_unlock(pte, ptl); 89 pte_unmap_unlock(pte, ptl);
@@ -114,7 +115,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
114 115
115 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); 116 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
116 pte_val = *pte; 117 pte_val = *pte;
117 update_mmu_cache(vma, addr, pte_val); 118 /*
119 * We don't need to run update_mmu_cache() here because the "file pte"
120 * being installed by install_file_pte() is not a real pte - it's a
121 * non-present entry (like a swap entry), noting what file offset should
122 * be mapped there when there's a fault (in a non-linear vma where
123 * that's not obvious).
124 */
118 pte_unmap_unlock(pte, ptl); 125 pte_unmap_unlock(pte, ptl);
119 err = 0; 126 err = 0;
120out: 127out:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832f676ca038..df499973255f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
22#include "internal.h" 22#include "internal.h"
23 23
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26unsigned long max_huge_pages; 26unsigned long max_huge_pages;
27static struct list_head hugepage_freelists[MAX_NUMNODES]; 27static struct list_head hugepage_freelists[MAX_NUMNODES];
28static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 28static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void)
123static struct page *alloc_huge_page(struct vm_area_struct *vma, 123static struct page *alloc_huge_page(struct vm_area_struct *vma,
124 unsigned long addr) 124 unsigned long addr)
125{ 125{
126 struct inode *inode = vma->vm_file->f_dentry->d_inode;
127 struct page *page; 126 struct page *page;
128 int use_reserve = 0;
129 unsigned long idx;
130 127
131 spin_lock(&hugetlb_lock); 128 spin_lock(&hugetlb_lock);
132 129 if (vma->vm_flags & VM_MAYSHARE)
133 if (vma->vm_flags & VM_MAYSHARE) { 130 resv_huge_pages--;
134 131 else if (free_huge_pages <= resv_huge_pages)
135 /* idx = radix tree index, i.e. offset into file in 132 goto fail;
136 * HPAGE_SIZE units */
137 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
138 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
139
140 /* The hugetlbfs specific inode info stores the number
141 * of "guaranteed available" (huge) pages. That is,
142 * the first 'prereserved_hpages' pages of the inode
143 * are either already instantiated, or have been
144 * pre-reserved (by hugetlb_reserve_for_inode()). Here
145 * we're in the process of instantiating the page, so
146 * we use this to determine whether to draw from the
147 * pre-reserved pool or the truly free pool. */
148 if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
149 use_reserve = 1;
150 }
151
152 if (!use_reserve) {
153 if (free_huge_pages <= reserved_huge_pages)
154 goto fail;
155 } else {
156 BUG_ON(reserved_huge_pages == 0);
157 reserved_huge_pages--;
158 }
159 133
160 page = dequeue_huge_page(vma, addr); 134 page = dequeue_huge_page(vma, addr);
161 if (!page) 135 if (!page)
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
165 set_page_refcounted(page); 139 set_page_refcounted(page);
166 return page; 140 return page;
167 141
168 fail: 142fail:
169 WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
170 spin_unlock(&hugetlb_lock); 143 spin_unlock(&hugetlb_lock);
171 return NULL; 144 return NULL;
172} 145}
173 146
174/* hugetlb_extend_reservation()
175 *
176 * Ensure that at least 'atleast' hugepages are, and will remain,
177 * available to instantiate the first 'atleast' pages of the given
178 * inode. If the inode doesn't already have this many pages reserved
179 * or instantiated, set aside some hugepages in the reserved pool to
180 * satisfy later faults (or fail now if there aren't enough, rather
181 * than getting the SIGBUS later).
182 */
183int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
184 unsigned long atleast)
185{
186 struct inode *inode = &info->vfs_inode;
187 unsigned long change_in_reserve = 0;
188 int ret = 0;
189
190 spin_lock(&hugetlb_lock);
191 read_lock_irq(&inode->i_mapping->tree_lock);
192
193 if (info->prereserved_hpages >= atleast)
194 goto out;
195
196 /* Because we always call this on shared mappings, none of the
197 * pages beyond info->prereserved_hpages can have been
198 * instantiated, so we need to reserve all of them now. */
199 change_in_reserve = atleast - info->prereserved_hpages;
200
201 if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
202 ret = -ENOMEM;
203 goto out;
204 }
205
206 reserved_huge_pages += change_in_reserve;
207 info->prereserved_hpages = atleast;
208
209 out:
210 read_unlock_irq(&inode->i_mapping->tree_lock);
211 spin_unlock(&hugetlb_lock);
212
213 return ret;
214}
215
216/* hugetlb_truncate_reservation()
217 *
218 * This returns pages reserved for the given inode to the general free
219 * hugepage pool. If the inode has any pages prereserved, but not
220 * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
221 * them.
222 */
223void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
224 unsigned long atmost)
225{
226 struct inode *inode = &info->vfs_inode;
227 struct address_space *mapping = inode->i_mapping;
228 unsigned long idx;
229 unsigned long change_in_reserve = 0;
230 struct page *page;
231
232 spin_lock(&hugetlb_lock);
233 read_lock_irq(&inode->i_mapping->tree_lock);
234
235 if (info->prereserved_hpages <= atmost)
236 goto out;
237
238 /* Count pages which were reserved, but not instantiated, and
239 * which we can now release. */
240 for (idx = atmost; idx < info->prereserved_hpages; idx++) {
241 page = radix_tree_lookup(&mapping->page_tree, idx);
242 if (!page)
243 /* Pages which are already instantiated can't
244 * be unreserved (and in fact have already
245 * been removed from the reserved pool) */
246 change_in_reserve++;
247 }
248
249 BUG_ON(reserved_huge_pages < change_in_reserve);
250 reserved_huge_pages -= change_in_reserve;
251 info->prereserved_hpages = atmost;
252
253 out:
254 read_unlock_irq(&inode->i_mapping->tree_lock);
255 spin_unlock(&hugetlb_lock);
256}
257
258static int __init hugetlb_init(void) 147static int __init hugetlb_init(void)
259{ 148{
260 unsigned long i; 149 unsigned long i;
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
334 return nr_huge_pages; 223 return nr_huge_pages;
335 224
336 spin_lock(&hugetlb_lock); 225 spin_lock(&hugetlb_lock);
337 count = max(count, reserved_huge_pages); 226 count = max(count, resv_huge_pages);
338 try_to_free_low(count); 227 try_to_free_low(count);
339 while (count < nr_huge_pages) { 228 while (count < nr_huge_pages) {
340 struct page *page = dequeue_huge_page(NULL, 0); 229 struct page *page = dequeue_huge_page(NULL, 0);
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf)
361 return sprintf(buf, 250 return sprintf(buf,
362 "HugePages_Total: %5lu\n" 251 "HugePages_Total: %5lu\n"
363 "HugePages_Free: %5lu\n" 252 "HugePages_Free: %5lu\n"
364 "HugePages_Rsvd: %5lu\n" 253 "HugePages_Rsvd: %5lu\n"
365 "Hugepagesize: %5lu kB\n", 254 "Hugepagesize: %5lu kB\n",
366 nr_huge_pages, 255 nr_huge_pages,
367 free_huge_pages, 256 free_huge_pages,
368 reserved_huge_pages, 257 resv_huge_pages,
369 HPAGE_SIZE/1024); 258 HPAGE_SIZE/1024);
370} 259}
371 260
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
754 flush_tlb_range(vma, start, end); 643 flush_tlb_range(vma, start, end);
755} 644}
756 645
646struct file_region {
647 struct list_head link;
648 long from;
649 long to;
650};
651
652static long region_add(struct list_head *head, long f, long t)
653{
654 struct file_region *rg, *nrg, *trg;
655
656 /* Locate the region we are either in or before. */
657 list_for_each_entry(rg, head, link)
658 if (f <= rg->to)
659 break;
660
661 /* Round our left edge to the current segment if it encloses us. */
662 if (f > rg->from)
663 f = rg->from;
664
665 /* Check for and consume any regions we now overlap with. */
666 nrg = rg;
667 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
668 if (&rg->link == head)
669 break;
670 if (rg->from > t)
671 break;
672
673 /* If this area reaches higher then extend our area to
674 * include it completely. If this is not the first area
675 * which we intend to reuse, free it. */
676 if (rg->to > t)
677 t = rg->to;
678 if (rg != nrg) {
679 list_del(&rg->link);
680 kfree(rg);
681 }
682 }
683 nrg->from = f;
684 nrg->to = t;
685 return 0;
686}
687
688static long region_chg(struct list_head *head, long f, long t)
689{
690 struct file_region *rg, *nrg;
691 long chg = 0;
692
693 /* Locate the region we are before or in. */
694 list_for_each_entry(rg, head, link)
695 if (f <= rg->to)
696 break;
697
698 /* If we are below the current region then a new region is required.
699 * Subtle, allocate a new region at the position but make it zero
700 * size such that we can guarentee to record the reservation. */
701 if (&rg->link == head || t < rg->from) {
702 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
703 if (nrg == 0)
704 return -ENOMEM;
705 nrg->from = f;
706 nrg->to = f;
707 INIT_LIST_HEAD(&nrg->link);
708 list_add(&nrg->link, rg->link.prev);
709
710 return t - f;
711 }
712
713 /* Round our left edge to the current segment if it encloses us. */
714 if (f > rg->from)
715 f = rg->from;
716 chg = t - f;
717
718 /* Check for and consume any regions we now overlap with. */
719 list_for_each_entry(rg, rg->link.prev, link) {
720 if (&rg->link == head)
721 break;
722 if (rg->from > t)
723 return chg;
724
725 /* We overlap with this area, if it extends futher than
726 * us then we must extend ourselves. Account for its
727 * existing reservation. */
728 if (rg->to > t) {
729 chg += rg->to - t;
730 t = rg->to;
731 }
732 chg -= rg->to - rg->from;
733 }
734 return chg;
735}
736
737static long region_truncate(struct list_head *head, long end)
738{
739 struct file_region *rg, *trg;
740 long chg = 0;
741
742 /* Locate the region we are either in or before. */
743 list_for_each_entry(rg, head, link)
744 if (end <= rg->to)
745 break;
746 if (&rg->link == head)
747 return 0;
748
749 /* If we are in the middle of a region then adjust it. */
750 if (end > rg->from) {
751 chg = rg->to - end;
752 rg->to = end;
753 rg = list_entry(rg->link.next, typeof(*rg), link);
754 }
755
756 /* Drop any remaining regions. */
757 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
758 if (&rg->link == head)
759 break;
760 chg += rg->to - rg->from;
761 list_del(&rg->link);
762 kfree(rg);
763 }
764 return chg;
765}
766
767static int hugetlb_acct_memory(long delta)
768{
769 int ret = -ENOMEM;
770
771 spin_lock(&hugetlb_lock);
772 if ((delta + resv_huge_pages) <= free_huge_pages) {
773 resv_huge_pages += delta;
774 ret = 0;
775 }
776 spin_unlock(&hugetlb_lock);
777 return ret;
778}
779
780int hugetlb_reserve_pages(struct inode *inode, long from, long to)
781{
782 long ret, chg;
783
784 chg = region_chg(&inode->i_mapping->private_list, from, to);
785 if (chg < 0)
786 return chg;
787 ret = hugetlb_acct_memory(chg);
788 if (ret < 0)
789 return ret;
790 region_add(&inode->i_mapping->private_list, from, to);
791 return 0;
792}
793
794void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
795{
796 long chg = region_truncate(&inode->i_mapping->private_list, offset);
797 hugetlb_acct_memory(freed - chg);
798}
diff --git a/mm/memory.c b/mm/memory.c
index 0ec7bc644271..247b5c312b9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
434 /* pte contains position in swap or file, so copy. */ 434 /* pte contains position in swap or file, so copy. */
435 if (unlikely(!pte_present(pte))) { 435 if (unlikely(!pte_present(pte))) {
436 if (!pte_file(pte)) { 436 if (!pte_file(pte)) {
437 swap_duplicate(pte_to_swp_entry(pte)); 437 swp_entry_t entry = pte_to_swp_entry(pte);
438
439 swap_duplicate(entry);
438 /* make sure dst_mm is on swapoff's mmlist. */ 440 /* make sure dst_mm is on swapoff's mmlist. */
439 if (unlikely(list_empty(&dst_mm->mmlist))) { 441 if (unlikely(list_empty(&dst_mm->mmlist))) {
440 spin_lock(&mmlist_lock); 442 spin_lock(&mmlist_lock);
@@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
443 &src_mm->mmlist); 445 &src_mm->mmlist);
444 spin_unlock(&mmlist_lock); 446 spin_unlock(&mmlist_lock);
445 } 447 }
448 if (is_write_migration_entry(entry) &&
449 is_cow_mapping(vm_flags)) {
450 /*
451 * COW mappings require pages in both parent
452 * and child to be set to read.
453 */
454 make_migration_entry_read(&entry);
455 pte = swp_entry_to_pte(entry);
456 set_pte_at(src_mm, addr, src_pte, pte);
457 }
446 } 458 }
447 goto out_set_pte; 459 goto out_set_pte;
448 } 460 }
@@ -1445,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1445{ 1457{
1446 struct page *old_page, *new_page; 1458 struct page *old_page, *new_page;
1447 pte_t entry; 1459 pte_t entry;
1448 int ret = VM_FAULT_MINOR; 1460 int reuse, ret = VM_FAULT_MINOR;
1449 1461
1450 old_page = vm_normal_page(vma, address, orig_pte); 1462 old_page = vm_normal_page(vma, address, orig_pte);
1451 if (!old_page) 1463 if (!old_page)
1452 goto gotten; 1464 goto gotten;
1453 1465
1454 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { 1466 if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
1455 int reuse = can_share_swap_page(old_page); 1467 (VM_SHARED|VM_WRITE))) {
1456 unlock_page(old_page); 1468 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1457 if (reuse) { 1469 /*
1458 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1470 * Notify the address space that the page is about to
1459 entry = pte_mkyoung(orig_pte); 1471 * become writable so that it can prohibit this or wait
1460 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1472 * for the page to get into an appropriate state.
1461 ptep_set_access_flags(vma, address, page_table, entry, 1); 1473 *
1462 update_mmu_cache(vma, address, entry); 1474 * We do this without the lock held, so that it can
1463 lazy_mmu_prot_update(entry); 1475 * sleep if it needs to.
1464 ret |= VM_FAULT_WRITE; 1476 */
1465 goto unlock; 1477 page_cache_get(old_page);
1478 pte_unmap_unlock(page_table, ptl);
1479
1480 if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
1481 goto unwritable_page;
1482
1483 page_cache_release(old_page);
1484
1485 /*
1486 * Since we dropped the lock we need to revalidate
1487 * the PTE as someone else may have changed it. If
1488 * they did, we just return, as we can count on the
1489 * MMU to tell us if they didn't also make it writable.
1490 */
1491 page_table = pte_offset_map_lock(mm, pmd, address,
1492 &ptl);
1493 if (!pte_same(*page_table, orig_pte))
1494 goto unlock;
1466 } 1495 }
1496
1497 reuse = 1;
1498 } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1499 reuse = can_share_swap_page(old_page);
1500 unlock_page(old_page);
1501 } else {
1502 reuse = 0;
1503 }
1504
1505 if (reuse) {
1506 flush_cache_page(vma, address, pte_pfn(orig_pte));
1507 entry = pte_mkyoung(orig_pte);
1508 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1509 ptep_set_access_flags(vma, address, page_table, entry, 1);
1510 update_mmu_cache(vma, address, entry);
1511 lazy_mmu_prot_update(entry);
1512 ret |= VM_FAULT_WRITE;
1513 goto unlock;
1467 } 1514 }
1468 1515
1469 /* 1516 /*
@@ -1523,6 +1570,10 @@ oom:
1523 if (old_page) 1570 if (old_page)
1524 page_cache_release(old_page); 1571 page_cache_release(old_page);
1525 return VM_FAULT_OOM; 1572 return VM_FAULT_OOM;
1573
1574unwritable_page:
1575 page_cache_release(old_page);
1576 return VM_FAULT_SIGBUS;
1526} 1577}
1527 1578
1528/* 1579/*
@@ -1879,7 +1930,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1879 goto out; 1930 goto out;
1880 1931
1881 entry = pte_to_swp_entry(orig_pte); 1932 entry = pte_to_swp_entry(orig_pte);
1882again: 1933 if (is_migration_entry(entry)) {
1934 migration_entry_wait(mm, pmd, address);
1935 goto out;
1936 }
1883 page = lookup_swap_cache(entry); 1937 page = lookup_swap_cache(entry);
1884 if (!page) { 1938 if (!page) {
1885 swapin_readahead(entry, address, vma); 1939 swapin_readahead(entry, address, vma);
@@ -1903,12 +1957,6 @@ again:
1903 1957
1904 mark_page_accessed(page); 1958 mark_page_accessed(page);
1905 lock_page(page); 1959 lock_page(page);
1906 if (!PageSwapCache(page)) {
1907 /* Page migration has occured */
1908 unlock_page(page);
1909 page_cache_release(page);
1910 goto again;
1911 }
1912 1960
1913 /* 1961 /*
1914 * Back out if somebody else already faulted in this pte. 1962 * Back out if somebody else already faulted in this pte.
@@ -2074,18 +2122,31 @@ retry:
2074 /* 2122 /*
2075 * Should we do an early C-O-W break? 2123 * Should we do an early C-O-W break?
2076 */ 2124 */
2077 if (write_access && !(vma->vm_flags & VM_SHARED)) { 2125 if (write_access) {
2078 struct page *page; 2126 if (!(vma->vm_flags & VM_SHARED)) {
2127 struct page *page;
2079 2128
2080 if (unlikely(anon_vma_prepare(vma))) 2129 if (unlikely(anon_vma_prepare(vma)))
2081 goto oom; 2130 goto oom;
2082 page = alloc_page_vma(GFP_HIGHUSER, vma, address); 2131 page = alloc_page_vma(GFP_HIGHUSER, vma, address);
2083 if (!page) 2132 if (!page)
2084 goto oom; 2133 goto oom;
2085 copy_user_highpage(page, new_page, address); 2134 copy_user_highpage(page, new_page, address);
2086 page_cache_release(new_page); 2135 page_cache_release(new_page);
2087 new_page = page; 2136 new_page = page;
2088 anon = 1; 2137 anon = 1;
2138
2139 } else {
2140 /* if the page will be shareable, see if the backing
2141 * address space wants to know that the page is about
2142 * to become writable */
2143 if (vma->vm_ops->page_mkwrite &&
2144 vma->vm_ops->page_mkwrite(vma, new_page) < 0
2145 ) {
2146 page_cache_release(new_page);
2147 return VM_FAULT_SIGBUS;
2148 }
2149 }
2089 } 2150 }
2090 2151
2091 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2152 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 70df5c0d957e..841a077d5aeb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,7 +26,7 @@
26 26
27extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 27extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
28 unsigned long size); 28 unsigned long size);
29static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) 29static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
30{ 30{
31 struct pglist_data *pgdat = zone->zone_pgdat; 31 struct pglist_data *pgdat = zone->zone_pgdat;
32 int nr_pages = PAGES_PER_SECTION; 32 int nr_pages = PAGES_PER_SECTION;
@@ -34,8 +34,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
34 int zone_type; 34 int zone_type;
35 35
36 zone_type = zone - pgdat->node_zones; 36 zone_type = zone - pgdat->node_zones;
37 if (!populated_zone(zone)) {
38 int ret = 0;
39 ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
40 if (ret < 0)
41 return ret;
42 }
37 memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); 43 memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
38 zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); 44 zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
45 return 0;
39} 46}
40 47
41extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, 48extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
@@ -50,7 +57,11 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
50 if (ret < 0) 57 if (ret < 0)
51 return ret; 58 return ret;
52 59
53 __add_zone(zone, phys_start_pfn); 60 ret = __add_zone(zone, phys_start_pfn);
61
62 if (ret < 0)
63 return ret;
64
54 return register_new_memory(__pfn_to_section(phys_start_pfn)); 65 return register_new_memory(__pfn_to_section(phys_start_pfn));
55} 66}
56 67
@@ -116,6 +127,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
116 unsigned long flags; 127 unsigned long flags;
117 unsigned long onlined_pages = 0; 128 unsigned long onlined_pages = 0;
118 struct zone *zone; 129 struct zone *zone;
130 int need_zonelists_rebuild = 0;
119 131
120 /* 132 /*
121 * This doesn't need a lock to do pfn_to_page(). 133 * This doesn't need a lock to do pfn_to_page().
@@ -128,6 +140,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
128 grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); 140 grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
129 pgdat_resize_unlock(zone->zone_pgdat, &flags); 141 pgdat_resize_unlock(zone->zone_pgdat, &flags);
130 142
143 /*
144 * If this zone is not populated, then it is not in zonelist.
145 * This means the page allocator ignores this zone.
146 * So, zonelist must be updated after online.
147 */
148 if (!populated_zone(zone))
149 need_zonelists_rebuild = 1;
150
131 for (i = 0; i < nr_pages; i++) { 151 for (i = 0; i < nr_pages; i++) {
132 struct page *page = pfn_to_page(pfn + i); 152 struct page *page = pfn_to_page(pfn + i);
133 online_page(page); 153 online_page(page);
@@ -138,5 +158,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
138 158
139 setup_per_zone_pages_min(); 159 setup_per_zone_pages_min();
140 160
161 if (need_zonelists_rebuild)
162 build_all_zonelists();
163 vm_total_pages = nr_free_pagecache_pages();
141 return 0; 164 return 0;
142} 165}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8778f58880c4..ec4a1a950df9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -87,6 +87,8 @@
87#include <linux/seq_file.h> 87#include <linux/seq_file.h>
88#include <linux/proc_fs.h> 88#include <linux/proc_fs.h>
89#include <linux/migrate.h> 89#include <linux/migrate.h>
90#include <linux/rmap.h>
91#include <linux/security.h>
90 92
91#include <asm/tlbflush.h> 93#include <asm/tlbflush.h>
92#include <asm/uaccess.h> 94#include <asm/uaccess.h>
@@ -587,6 +589,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
587 isolate_lru_page(page, pagelist); 589 isolate_lru_page(page, pagelist);
588} 590}
589 591
592static struct page *new_node_page(struct page *page, unsigned long node, int **x)
593{
594 return alloc_pages_node(node, GFP_HIGHUSER, 0);
595}
596
590/* 597/*
591 * Migrate pages from one node to a target node. 598 * Migrate pages from one node to a target node.
592 * Returns error or the number of pages not migrated. 599 * Returns error or the number of pages not migrated.
@@ -603,11 +610,9 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
603 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, 610 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
604 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 611 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
605 612
606 if (!list_empty(&pagelist)) { 613 if (!list_empty(&pagelist))
607 err = migrate_pages_to(&pagelist, NULL, dest); 614 err = migrate_pages(&pagelist, new_node_page, dest);
608 if (!list_empty(&pagelist)) 615
609 putback_lru_pages(&pagelist);
610 }
611 return err; 616 return err;
612} 617}
613 618
@@ -694,6 +699,12 @@ int do_migrate_pages(struct mm_struct *mm,
694 699
695} 700}
696 701
702static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
703{
704 struct vm_area_struct *vma = (struct vm_area_struct *)private;
705
706 return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
707}
697#else 708#else
698 709
699static void migrate_page_add(struct page *page, struct list_head *pagelist, 710static void migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -706,6 +717,11 @@ int do_migrate_pages(struct mm_struct *mm,
706{ 717{
707 return -ENOSYS; 718 return -ENOSYS;
708} 719}
720
721static struct page *new_vma_page(struct page *page, unsigned long private)
722{
723 return NULL;
724}
709#endif 725#endif
710 726
711long do_mbind(unsigned long start, unsigned long len, 727long do_mbind(unsigned long start, unsigned long len,
@@ -767,15 +783,13 @@ long do_mbind(unsigned long start, unsigned long len,
767 err = mbind_range(vma, start, end, new); 783 err = mbind_range(vma, start, end, new);
768 784
769 if (!list_empty(&pagelist)) 785 if (!list_empty(&pagelist))
770 nr_failed = migrate_pages_to(&pagelist, vma, -1); 786 nr_failed = migrate_pages(&pagelist, new_vma_page,
787 (unsigned long)vma);
771 788
772 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 789 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
773 err = -EIO; 790 err = -EIO;
774 } 791 }
775 792
776 if (!list_empty(&pagelist))
777 putback_lru_pages(&pagelist);
778
779 up_write(&mm->mmap_sem); 793 up_write(&mm->mmap_sem);
780 mpol_free(new); 794 mpol_free(new);
781 return err; 795 return err;
@@ -929,6 +943,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
929 goto out; 943 goto out;
930 } 944 }
931 945
946 err = security_task_movememory(task);
947 if (err)
948 goto out;
949
932 err = do_migrate_pages(mm, &old, &new, 950 err = do_migrate_pages(mm, &old, &new,
933 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 951 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
934out: 952out:
diff --git a/mm/migrate.c b/mm/migrate.c
index 1c25040693d2..1c2a71aa05cd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -15,6 +15,7 @@
15#include <linux/migrate.h> 15#include <linux/migrate.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/swapops.h>
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
20#include <linux/mm_inline.h> 21#include <linux/mm_inline.h>
@@ -23,13 +24,13 @@
23#include <linux/topology.h> 24#include <linux/topology.h>
24#include <linux/cpu.h> 25#include <linux/cpu.h>
25#include <linux/cpuset.h> 26#include <linux/cpuset.h>
26#include <linux/swapops.h> 27#include <linux/writeback.h>
28#include <linux/mempolicy.h>
29#include <linux/vmalloc.h>
30#include <linux/security.h>
27 31
28#include "internal.h" 32#include "internal.h"
29 33
30/* The maximum number of pages to take off the LRU for migration */
31#define MIGRATE_CHUNK_SIZE 256
32
33#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 34#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
34 35
35/* 36/*
@@ -64,16 +65,11 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist)
64} 65}
65 66
66/* 67/*
67 * migrate_prep() needs to be called after we have compiled the list of pages 68 * migrate_prep() needs to be called before we start compiling a list of pages
68 * to be migrated using isolate_lru_page() but before we begin a series of calls 69 * to be migrated using isolate_lru_page().
69 * to migrate_pages().
70 */ 70 */
71int migrate_prep(void) 71int migrate_prep(void)
72{ 72{
73 /* Must have swap device for migration */
74 if (nr_swap_pages <= 0)
75 return -ENODEV;
76
77 /* 73 /*
78 * Clear the LRU lists so pages can be isolated. 74 * Clear the LRU lists so pages can be isolated.
79 * Note that pages may be moved off the LRU after we have 75 * Note that pages may be moved off the LRU after we have
@@ -87,7 +83,6 @@ int migrate_prep(void)
87 83
88static inline void move_to_lru(struct page *page) 84static inline void move_to_lru(struct page *page)
89{ 85{
90 list_del(&page->lru);
91 if (PageActive(page)) { 86 if (PageActive(page)) {
92 /* 87 /*
93 * lru_cache_add_active checks that 88 * lru_cache_add_active checks that
@@ -113,113 +108,200 @@ int putback_lru_pages(struct list_head *l)
113 int count = 0; 108 int count = 0;
114 109
115 list_for_each_entry_safe(page, page2, l, lru) { 110 list_for_each_entry_safe(page, page2, l, lru) {
111 list_del(&page->lru);
116 move_to_lru(page); 112 move_to_lru(page);
117 count++; 113 count++;
118 } 114 }
119 return count; 115 return count;
120} 116}
121 117
122/* 118static inline int is_swap_pte(pte_t pte)
123 * Non migratable page
124 */
125int fail_migrate_page(struct page *newpage, struct page *page)
126{ 119{
127 return -EIO; 120 return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
128} 121}
129EXPORT_SYMBOL(fail_migrate_page);
130 122
131/* 123/*
132 * swapout a single page 124 * Restore a potential migration pte to a working pte entry
133 * page is locked upon entry, unlocked on exit
134 */ 125 */
135static int swap_page(struct page *page) 126static void remove_migration_pte(struct vm_area_struct *vma,
127 struct page *old, struct page *new)
136{ 128{
137 struct address_space *mapping = page_mapping(page); 129 struct mm_struct *mm = vma->vm_mm;
130 swp_entry_t entry;
131 pgd_t *pgd;
132 pud_t *pud;
133 pmd_t *pmd;
134 pte_t *ptep, pte;
135 spinlock_t *ptl;
136 unsigned long addr = page_address_in_vma(new, vma);
137
138 if (addr == -EFAULT)
139 return;
140
141 pgd = pgd_offset(mm, addr);
142 if (!pgd_present(*pgd))
143 return;
144
145 pud = pud_offset(pgd, addr);
146 if (!pud_present(*pud))
147 return;
148
149 pmd = pmd_offset(pud, addr);
150 if (!pmd_present(*pmd))
151 return;
152
153 ptep = pte_offset_map(pmd, addr);
154
155 if (!is_swap_pte(*ptep)) {
156 pte_unmap(ptep);
157 return;
158 }
138 159
139 if (page_mapped(page) && mapping) 160 ptl = pte_lockptr(mm, pmd);
140 if (try_to_unmap(page, 1) != SWAP_SUCCESS) 161 spin_lock(ptl);
141 goto unlock_retry; 162 pte = *ptep;
163 if (!is_swap_pte(pte))
164 goto out;
142 165
143 if (PageDirty(page)) { 166 entry = pte_to_swp_entry(pte);
144 /* Page is dirty, try to write it out here */
145 switch(pageout(page, mapping)) {
146 case PAGE_KEEP:
147 case PAGE_ACTIVATE:
148 goto unlock_retry;
149 167
150 case PAGE_SUCCESS: 168 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
151 goto retry; 169 goto out;
152 170
153 case PAGE_CLEAN: 171 get_page(new);
154 ; /* try to free the page below */ 172 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
155 } 173 if (is_write_migration_entry(entry))
156 } 174 pte = pte_mkwrite(pte);
175 set_pte_at(mm, addr, ptep, pte);
157 176
158 if (PagePrivate(page)) { 177 if (PageAnon(new))
159 if (!try_to_release_page(page, GFP_KERNEL) || 178 page_add_anon_rmap(new, vma, addr);
160 (!mapping && page_count(page) == 1)) 179 else
161 goto unlock_retry; 180 page_add_file_rmap(new);
162 }
163 181
164 if (remove_mapping(mapping, page)) { 182 /* No need to invalidate - it was non-present before */
165 /* Success */ 183 update_mmu_cache(vma, addr, pte);
166 unlock_page(page); 184 lazy_mmu_prot_update(pte);
167 return 0;
168 }
169 185
170unlock_retry: 186out:
171 unlock_page(page); 187 pte_unmap_unlock(ptep, ptl);
188}
172 189
173retry: 190/*
174 return -EAGAIN; 191 * Note that remove_file_migration_ptes will only work on regular mappings;
192 * Nonlinear mappings do not use migration entries.
193 */
194static void remove_file_migration_ptes(struct page *old, struct page *new)
195{
196 struct vm_area_struct *vma;
197 struct address_space *mapping = page_mapping(new);
198 struct prio_tree_iter iter;
199 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
200
201 if (!mapping)
202 return;
203
204 spin_lock(&mapping->i_mmap_lock);
205
206 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
207 remove_migration_pte(vma, old, new);
208
209 spin_unlock(&mapping->i_mmap_lock);
175} 210}
176 211
177/* 212/*
178 * Remove references for a page and establish the new page with the correct 213 * Must hold mmap_sem lock on at least one of the vmas containing
179 * basic settings to be able to stop accesses to the page. 214 * the page so that the anon_vma cannot vanish.
180 */ 215 */
181int migrate_page_remove_references(struct page *newpage, 216static void remove_anon_migration_ptes(struct page *old, struct page *new)
182 struct page *page, int nr_refs)
183{ 217{
184 struct address_space *mapping = page_mapping(page); 218 struct anon_vma *anon_vma;
185 struct page **radix_pointer; 219 struct vm_area_struct *vma;
220 unsigned long mapping;
186 221
187 /* 222 mapping = (unsigned long)new->mapping;
188 * Avoid doing any of the following work if the page count
189 * indicates that the page is in use or truncate has removed
190 * the page.
191 */
192 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
193 return -EAGAIN;
194 223
195 /* 224 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
196 * Establish swap ptes for anonymous pages or destroy pte 225 return;
197 * maps for files.
198 *
199 * In order to reestablish file backed mappings the fault handlers
200 * will take the radix tree_lock which may then be used to stop
201 * processes from accessing this page until the new page is ready.
202 *
203 * A process accessing via a swap pte (an anonymous page) will take a
204 * page_lock on the old page which will block the process until the
205 * migration attempt is complete. At that time the PageSwapCache bit
206 * will be examined. If the page was migrated then the PageSwapCache
207 * bit will be clear and the operation to retrieve the page will be
208 * retried which will find the new page in the radix tree. Then a new
209 * direct mapping may be generated based on the radix tree contents.
210 *
211 * If the page was not migrated then the PageSwapCache bit
212 * is still set and the operation may continue.
213 */
214 if (try_to_unmap(page, 1) == SWAP_FAIL)
215 /* A vma has VM_LOCKED set -> permanent failure */
216 return -EPERM;
217 226
218 /* 227 /*
219 * Give up if we were unable to remove all mappings. 228 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
220 */ 229 */
221 if (page_mapcount(page)) 230 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
222 return -EAGAIN; 231 spin_lock(&anon_vma->lock);
232
233 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
234 remove_migration_pte(vma, old, new);
235
236 spin_unlock(&anon_vma->lock);
237}
238
239/*
240 * Get rid of all migration entries and replace them by
241 * references to the indicated page.
242 */
243static void remove_migration_ptes(struct page *old, struct page *new)
244{
245 if (PageAnon(new))
246 remove_anon_migration_ptes(old, new);
247 else
248 remove_file_migration_ptes(old, new);
249}
250
251/*
252 * Something used the pte of a page under migration. We need to
253 * get to the page and wait until migration is finished.
254 * When we return from this function the fault will be retried.
255 *
256 * This function is called from do_swap_page().
257 */
258void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
259 unsigned long address)
260{
261 pte_t *ptep, pte;
262 spinlock_t *ptl;
263 swp_entry_t entry;
264 struct page *page;
265
266 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
267 pte = *ptep;
268 if (!is_swap_pte(pte))
269 goto out;
270
271 entry = pte_to_swp_entry(pte);
272 if (!is_migration_entry(entry))
273 goto out;
274
275 page = migration_entry_to_page(entry);
276
277 get_page(page);
278 pte_unmap_unlock(ptep, ptl);
279 wait_on_page_locked(page);
280 put_page(page);
281 return;
282out:
283 pte_unmap_unlock(ptep, ptl);
284}
285
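Both halves above lean on the same trick: remove_migration_pte() can rebuild a working pte, and migration_entry_wait() can recognize one in the fault path, because a migration entry is a non-present pte that still records which page is being moved and whether the mapping was writable. A minimal user-space model of that encoding follows; the macro and helper names are illustrative, not the kernel's swapops.h API, and the only assumption is that the page pointer is at least 4-byte aligned.

/*
 * Standalone model (user space): a "migration entry" packs the page
 * pointer and the remembered write permission into one non-present
 * pte-sized word.  Names are illustrative; the real helpers live in
 * include/linux/swapops.h.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ENTRY_PRESENT	0x1UL	/* clear => pte looks non-present */
#define ENTRY_WRITE	0x2UL	/* remembered write permission    */

static uintptr_t make_migration_entry(void *page, int writable)
{
	/* low bits are free because the pointer is at least 4-aligned */
	return (uintptr_t)page | (writable ? ENTRY_WRITE : 0);
}

static void *entry_to_page(uintptr_t e)	{ return (void *)(e & ~3UL); }
static int entry_is_write(uintptr_t e)	{ return !!(e & ENTRY_WRITE); }

int main(void)
{
	long fake_page;		/* stands in for a struct page */
	uintptr_t pte = make_migration_entry(&fake_page, 1);

	assert(!(pte & ENTRY_PRESENT));			    /* fault path triggers   */
	assert(entry_to_page(pte) == (void *)&fake_page);   /* ...finds the old page */
	assert(entry_is_write(pte));			    /* ...and the write bit  */
	printf("entry -> page %p, writable %d\n",
	       entry_to_page(pte), entry_is_write(pte));
	return 0;
}

In the kernel, the faulting thread then does what migration_entry_wait() shows above: take a reference, drop the pte lock, and sleep on the old page's lock until the migration attempt completes and the page is unlocked.
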
286/*
287 * Replace the page in the mapping.
288 *
289 * The number of remaining references must be:
290 * 1 for anonymous pages without a mapping
291 * 2 for pages with a mapping
292 * 3 for pages with a mapping and PagePrivate set.
293 */
294static int migrate_page_move_mapping(struct address_space *mapping,
295 struct page *newpage, struct page *page)
296{
297 struct page **radix_pointer;
298
299 if (!mapping) {
300 /* Anonymous page */
301 if (page_count(page) != 1)
302 return -EAGAIN;
303 return 0;
304 }
223 305
224 write_lock_irq(&mapping->tree_lock); 306 write_lock_irq(&mapping->tree_lock);
225 307
@@ -227,7 +309,7 @@ int migrate_page_remove_references(struct page *newpage,
227 &mapping->page_tree, 309 &mapping->page_tree,
228 page_index(page)); 310 page_index(page));
229 311
230 if (!page_mapping(page) || page_count(page) != nr_refs || 312 if (page_count(page) != 2 + !!PagePrivate(page) ||
231 *radix_pointer != page) { 313 *radix_pointer != page) {
232 write_unlock_irq(&mapping->tree_lock); 314 write_unlock_irq(&mapping->tree_lock);
233 return -EAGAIN; 315 return -EAGAIN;
@@ -235,19 +317,14 @@ int migrate_page_remove_references(struct page *newpage,
235 317
236 /* 318 /*
237 * Now we know that no one else is looking at the page. 319 * Now we know that no one else is looking at the page.
238 *
239 * Certain minimal information about a page must be available
240 * in order for other subsystems to properly handle the page if they
241 * find it through the radix tree update before we are finished
242 * copying the page.
243 */ 320 */
244 get_page(newpage); 321 get_page(newpage);
245 newpage->index = page->index; 322#ifdef CONFIG_SWAP
246 newpage->mapping = page->mapping;
247 if (PageSwapCache(page)) { 323 if (PageSwapCache(page)) {
248 SetPageSwapCache(newpage); 324 SetPageSwapCache(newpage);
249 set_page_private(newpage, page_private(page)); 325 set_page_private(newpage, page_private(page));
250 } 326 }
327#endif
251 328
252 *radix_pointer = newpage; 329 *radix_pointer = newpage;
253 __put_page(page); 330 __put_page(page);
@@ -255,12 +332,11 @@ int migrate_page_remove_references(struct page *newpage,
255 332
256 return 0; 333 return 0;
257} 334}
258EXPORT_SYMBOL(migrate_page_remove_references);
259 335
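The reference counts spelled out in the comment above collapse into one expression; a small illustrative helper, where has_mapping and has_private are hypothetical stand-ins for page_mapping() and PagePrivate():

/* Illustrative only: the page_count() value migrate_page_move_mapping()
 * requires before it will switch the radix tree slot to the new page. */
static int expected_migration_refs(int has_mapping, int has_private)
{
	if (!has_mapping)
		return 1;		/* isolation reference only         */
	return 2 + !!has_private;	/* + page cache, + buffer head refs */
}

That is exactly the 2 + !!PagePrivate(page) and page_count(page) != 1 tests in the hunk above; any extra reference means someone else is still using the page and the move is retried with -EAGAIN.
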
260/* 336/*
261 * Copy the page to its new location 337 * Copy the page to its new location
262 */ 338 */
263void migrate_page_copy(struct page *newpage, struct page *page) 339static void migrate_page_copy(struct page *newpage, struct page *page)
264{ 340{
265 copy_highpage(newpage, page); 341 copy_highpage(newpage, page);
266 342
@@ -282,7 +358,9 @@ void migrate_page_copy(struct page *newpage, struct page *page)
282 set_page_dirty(newpage); 358 set_page_dirty(newpage);
283 } 359 }
284 360
361#ifdef CONFIG_SWAP
285 ClearPageSwapCache(page); 362 ClearPageSwapCache(page);
363#endif
286 ClearPageActive(page); 364 ClearPageActive(page);
287 ClearPagePrivate(page); 365 ClearPagePrivate(page);
288 set_page_private(page, 0); 366 set_page_private(page, 0);
@@ -295,7 +373,18 @@ void migrate_page_copy(struct page *newpage, struct page *page)
295 if (PageWriteback(newpage)) 373 if (PageWriteback(newpage))
296 end_page_writeback(newpage); 374 end_page_writeback(newpage);
297} 375}
298EXPORT_SYMBOL(migrate_page_copy); 376
377/************************************************************
378 * Migration functions
379 ***********************************************************/
380
381/* Always fail migration. Used for mappings that are not movable */
382int fail_migrate_page(struct address_space *mapping,
383 struct page *newpage, struct page *page)
384{
385 return -EIO;
386}
387EXPORT_SYMBOL(fail_migrate_page);
299 388
300/* 389/*
301 * Common logic to directly migrate a single page suitable for 390 * Common logic to directly migrate a single page suitable for
@@ -303,51 +392,286 @@ EXPORT_SYMBOL(migrate_page_copy);
303 * 392 *
304 * Pages are locked upon entry and exit. 393 * Pages are locked upon entry and exit.
305 */ 394 */
306int migrate_page(struct page *newpage, struct page *page) 395int migrate_page(struct address_space *mapping,
396 struct page *newpage, struct page *page)
307{ 397{
308 int rc; 398 int rc;
309 399
310 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 400 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
311 401
312 rc = migrate_page_remove_references(newpage, page, 2); 402 rc = migrate_page_move_mapping(mapping, newpage, page);
403
404 if (rc)
405 return rc;
406
407 migrate_page_copy(newpage, page);
408 return 0;
409}
410EXPORT_SYMBOL(migrate_page);
411
412/*
413 * Migration function for pages with buffers. This function can only be used
414 * if the underlying filesystem guarantees that no other references to "page"
415 * exist.
416 */
417int buffer_migrate_page(struct address_space *mapping,
418 struct page *newpage, struct page *page)
419{
420 struct buffer_head *bh, *head;
421 int rc;
422
423 if (!page_has_buffers(page))
424 return migrate_page(mapping, newpage, page);
425
426 head = page_buffers(page);
427
428 rc = migrate_page_move_mapping(mapping, newpage, page);
313 429
314 if (rc) 430 if (rc)
315 return rc; 431 return rc;
316 432
433 bh = head;
434 do {
435 get_bh(bh);
436 lock_buffer(bh);
437 bh = bh->b_this_page;
438
439 } while (bh != head);
440
441 ClearPagePrivate(page);
442 set_page_private(newpage, page_private(page));
443 set_page_private(page, 0);
444 put_page(page);
445 get_page(newpage);
446
447 bh = head;
448 do {
449 set_bh_page(bh, newpage, bh_offset(bh));
450 bh = bh->b_this_page;
451
452 } while (bh != head);
453
454 SetPagePrivate(newpage);
455
317 migrate_page_copy(newpage, page); 456 migrate_page_copy(newpage, page);
318 457
458 bh = head;
459 do {
460 unlock_buffer(bh);
461 put_bh(bh);
462 bh = bh->b_this_page;
463
464 } while (bh != head);
465
466 return 0;
467}
468EXPORT_SYMBOL(buffer_migrate_page);
469
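A filesystem whose pages carry buffer_heads opts in by pointing the migratepage member of its address_space_operations at the helper exported above. A hedged kernel-context sketch, not standalone code: examplefs is hypothetical, the remaining methods are omitted, and the header that declares buffer_migrate_page() depends on the kernel version.

#include <linux/fs.h>	/* struct address_space_operations */

/* examplefs is hypothetical; readpage, writepage, etc. omitted. */
static struct address_space_operations examplefs_aops = {
	.migratepage	= buffer_migrate_page,
};

Mappings that must never be migrated can point the same member at fail_migrate_page() instead, and mappings that define no method at all fall through to fallback_migrate_page() further down.
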
470/*
471 * Writeback a page to clean the dirty state
472 */
473static int writeout(struct address_space *mapping, struct page *page)
474{
475 struct writeback_control wbc = {
476 .sync_mode = WB_SYNC_NONE,
477 .nr_to_write = 1,
478 .range_start = 0,
479 .range_end = LLONG_MAX,
480 .nonblocking = 1,
481 .for_reclaim = 1
482 };
483 int rc;
484
485 if (!mapping->a_ops->writepage)
486 /* No write method for the address space */
487 return -EINVAL;
488
489 if (!clear_page_dirty_for_io(page))
490 /* Someone else already triggered a write */
491 return -EAGAIN;
492
319 /* 493 /*
320 * Remove auxiliary swap entries and replace 494 * A dirty page may imply that the underlying filesystem has
321 * them with real ptes. 495 * the page on some queue. So the page must be clean for
322 * 496 * migration. Writeout may mean we loose the lock and the
323 * Note that a real pte entry will allow processes that are not 497 * page state is no longer what we checked for earlier.
324 * waiting on the page lock to use the new page via the page tables 498 * At this point we know that the migration attempt cannot
325 * before the new page is unlocked. 499 * be successful.
326 */ 500 */
327 remove_from_swap(newpage); 501 remove_migration_ptes(page, page);
328 return 0; 502
503 rc = mapping->a_ops->writepage(page, &wbc);
504 if (rc < 0)
505 /* I/O Error writing */
506 return -EIO;
507
508 if (rc != AOP_WRITEPAGE_ACTIVATE)
509 /* unlocked. Relock */
510 lock_page(page);
511
512 return -EAGAIN;
513}
514
515/*
516 * Default handling if a filesystem does not provide a migration function.
517 */
518static int fallback_migrate_page(struct address_space *mapping,
519 struct page *newpage, struct page *page)
520{
521 if (PageDirty(page))
522 return writeout(mapping, page);
523
524 /*
525 * Buffers may be managed in a filesystem specific way.
526 * We must have no buffers or drop them.
527 */
528 if (page_has_buffers(page) &&
529 !try_to_release_page(page, GFP_KERNEL))
530 return -EAGAIN;
531
532 return migrate_page(mapping, newpage, page);
533}
534
535/*
536 * Move a page to a newly allocated page
537 * The page is locked and all ptes have been successfully removed.
538 *
539 * The new page will have replaced the old page if this function
540 * is successful.
541 */
542static int move_to_new_page(struct page *newpage, struct page *page)
543{
544 struct address_space *mapping;
545 int rc;
546
547 /*
548 * Block others from accessing the page when we get around to
549 * establishing additional references. We are the only one
550 * holding a reference to the new page at this point.
551 */
552 if (TestSetPageLocked(newpage))
553 BUG();
554
555 /* Prepare mapping for the new page.*/
556 newpage->index = page->index;
557 newpage->mapping = page->mapping;
558
559 mapping = page_mapping(page);
560 if (!mapping)
561 rc = migrate_page(mapping, newpage, page);
562 else if (mapping->a_ops->migratepage)
563 /*
564 * Most pages have a mapping and most filesystems
565 * should provide a migration function. Anonymous
566 * pages are part of swap space which also has its
567 * own migration function. This is the most common
568 * path for page migration.
569 */
570 rc = mapping->a_ops->migratepage(mapping,
571 newpage, page);
572 else
573 rc = fallback_migrate_page(mapping, newpage, page);
574
575 if (!rc)
576 remove_migration_ptes(page, newpage);
577 else
578 newpage->mapping = NULL;
579
580 unlock_page(newpage);
581
582 return rc;
583}
584
585/*
586 * Obtain the lock on page, remove all ptes and migrate the page
587 * to the newly allocated page in newpage.
588 */
589static int unmap_and_move(new_page_t get_new_page, unsigned long private,
590 struct page *page, int force)
591{
592 int rc = 0;
593 int *result = NULL;
594 struct page *newpage = get_new_page(page, private, &result);
595
596 if (!newpage)
597 return -ENOMEM;
598
599 if (page_count(page) == 1)
600 /* page was freed from under us. So we are done. */
601 goto move_newpage;
602
603 rc = -EAGAIN;
604 if (TestSetPageLocked(page)) {
605 if (!force)
606 goto move_newpage;
607 lock_page(page);
608 }
609
610 if (PageWriteback(page)) {
611 if (!force)
612 goto unlock;
613 wait_on_page_writeback(page);
614 }
615
616 /*
617 * Establish migration ptes or remove ptes
618 */
619 if (try_to_unmap(page, 1) != SWAP_FAIL) {
620 if (!page_mapped(page))
621 rc = move_to_new_page(newpage, page);
622 } else
623 /* A vma has VM_LOCKED set -> permanent failure */
624 rc = -EPERM;
625
626 if (rc)
627 remove_migration_ptes(page, page);
628unlock:
629 unlock_page(page);
630
631 if (rc != -EAGAIN) {
632 /*
633 * A page that has been migrated has all references
634 * removed and will be freed. A page that has not been
635 * migrated will have kept its references and be
636 * restored.
637 */
638 list_del(&page->lru);
639 move_to_lru(page);
640 }
641
642move_newpage:
643 /*
644 * Move the new page to the LRU. If migration was not successful
645 * then this will free the page.
646 */
647 move_to_lru(newpage);
648 if (result) {
649 if (rc)
650 *result = rc;
651 else
652 *result = page_to_nid(newpage);
653 }
654 return rc;
329} 655}
330EXPORT_SYMBOL(migrate_page);
331 656
332/* 657/*
333 * migrate_pages 658 * migrate_pages
334 * 659 *
335 * Two lists are passed to this function. The first list 660 * The function takes one list of pages to migrate and a function
336 * contains the pages isolated from the LRU to be migrated. 661 * that determines from the page to be migrated and the private data
337 * The second list contains new pages that the pages isolated 662 * the target of the move and allocates the page.
338 * can be moved to. If the second list is NULL then all
339 * pages are swapped out.
340 * 663 *
341 * The function returns after 10 attempts or if no pages 664 * The function returns after 10 attempts or if no pages
342 * are movable anymore because to has become empty 665 * are movable anymore because to has become empty
343 * or no retryable pages exist anymore. 666 * or no retryable pages exist anymore. All pages will be
667 * returned to the LRU or freed.
344 * 668 *
345 * Return: Number of pages not migrated when "to" ran empty. 669 * Return: Number of pages not migrated or error code.
346 */ 670 */
347int migrate_pages(struct list_head *from, struct list_head *to, 671int migrate_pages(struct list_head *from,
348 struct list_head *moved, struct list_head *failed) 672 new_page_t get_new_page, unsigned long private)
349{ 673{
350 int retry; 674 int retry = 1;
351 int nr_failed = 0; 675 int nr_failed = 0;
352 int pass = 0; 676 int pass = 0;
353 struct page *page; 677 struct page *page;
@@ -358,305 +682,297 @@ int migrate_pages(struct list_head *from, struct list_head *to,
358 if (!swapwrite) 682 if (!swapwrite)
359 current->flags |= PF_SWAPWRITE; 683 current->flags |= PF_SWAPWRITE;
360 684
361redo: 685 for(pass = 0; pass < 10 && retry; pass++) {
362 retry = 0; 686 retry = 0;
687
688 list_for_each_entry_safe(page, page2, from, lru) {
689 cond_resched();
690
691 rc = unmap_and_move(get_new_page, private,
692 page, pass > 2);
693
694 switch(rc) {
695 case -ENOMEM:
696 goto out;
697 case -EAGAIN:
698 retry++;
699 break;
700 case 0:
701 break;
702 default:
703 /* Permanent failure */
704 nr_failed++;
705 break;
706 }
707 }
708 }
709 rc = 0;
710out:
711 if (!swapwrite)
712 current->flags &= ~PF_SWAPWRITE;
363 713
364 list_for_each_entry_safe(page, page2, from, lru) { 714 putback_lru_pages(from);
365 struct page *newpage = NULL;
366 struct address_space *mapping;
367 715
368 cond_resched(); 716 if (rc)
717 return rc;
369 718
370 rc = 0; 719 return nr_failed + retry;
371 if (page_count(page) == 1) 720}
372 /* page was freed from under us. So we are done. */
373 goto next;
374 721
375 if (to && list_empty(to)) 722#ifdef CONFIG_NUMA
376 break; 723/*
724 * Move a list of individual pages
725 */
726struct page_to_node {
727 unsigned long addr;
728 struct page *page;
729 int node;
730 int status;
731};
377 732
378 /* 733static struct page *new_page_node(struct page *p, unsigned long private,
379 * Skip locked pages during the first two passes to give the 734 int **result)
380 * functions holding the lock time to release the page. Later we 735{
381 * use lock_page() to have a higher chance of acquiring the 736 struct page_to_node *pm = (struct page_to_node *)private;
382 * lock.
383 */
384 rc = -EAGAIN;
385 if (pass > 2)
386 lock_page(page);
387 else
388 if (TestSetPageLocked(page))
389 goto next;
390 737
391 /* 738 while (pm->node != MAX_NUMNODES && pm->page != p)
392 * Only wait on writeback if we have already done a pass where 739 pm++;
393 * we may have triggered writeouts for lots of pages.
394 */
395 if (pass > 0) {
396 wait_on_page_writeback(page);
397 } else {
398 if (PageWriteback(page))
399 goto unlock_page;
400 }
401 740
402 /* 741 if (pm->node == MAX_NUMNODES)
403 * Anonymous pages must have swap cache references otherwise 742 return NULL;
404 * the information contained in the page maps cannot be
405 * preserved.
406 */
407 if (PageAnon(page) && !PageSwapCache(page)) {
408 if (!add_to_swap(page, GFP_KERNEL)) {
409 rc = -ENOMEM;
410 goto unlock_page;
411 }
412 }
413 743
414 if (!to) { 744 *result = &pm->status;
415 rc = swap_page(page);
416 goto next;
417 }
418 745
419 newpage = lru_to_page(to); 746 return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
420 lock_page(newpage); 747}
421 748
422 /* 749/*
423 * Pages are properly locked and writeback is complete. 750 * Move a set of pages as indicated in the pm array. The addr
424 * Try to migrate the page. 751 * field must be set to the virtual address of the page to be moved
425 */ 752 * and the node number must contain a valid target node.
426 mapping = page_mapping(page); 753 */
427 if (!mapping) 754static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
428 goto unlock_both; 755 int migrate_all)
756{
757 int err;
758 struct page_to_node *pp;
759 LIST_HEAD(pagelist);
429 760
430 if (mapping->a_ops->migratepage) { 761 down_read(&mm->mmap_sem);
431 /*
432 * Most pages have a mapping and most filesystems
433 * should provide a migration function. Anonymous
434 * pages are part of swap space which also has its
435 * own migration function. This is the most common
436 * path for page migration.
437 */
438 rc = mapping->a_ops->migratepage(newpage, page);
439 goto unlock_both;
440 }
441
442 /* Make sure the dirty bit is up to date */
443 if (try_to_unmap(page, 1) == SWAP_FAIL) {
444 rc = -EPERM;
445 goto unlock_both;
446 }
447 762
448 if (page_mapcount(page)) { 763 /*
449 rc = -EAGAIN; 764 * Build a list of pages to migrate
450 goto unlock_both; 765 */
451 } 766 migrate_prep();
767 for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
768 struct vm_area_struct *vma;
769 struct page *page;
452 770
453 /* 771 /*
454 * Default handling if a filesystem does not provide 772 * A valid page pointer that will not match any of the
455 * a migration function. We can only migrate clean 773 * pages that will be moved.
456 * pages so try to write out any dirty pages first.
457 */ 774 */
458 if (PageDirty(page)) { 775 pp->page = ZERO_PAGE(0);
459 switch (pageout(page, mapping)) {
460 case PAGE_KEEP:
461 case PAGE_ACTIVATE:
462 goto unlock_both;
463
464 case PAGE_SUCCESS:
465 unlock_page(newpage);
466 goto next;
467
468 case PAGE_CLEAN:
469 ; /* try to migrate the page below */
470 }
471 }
472 776
473 /* 777 err = -EFAULT;
474 * Buffers are managed in a filesystem specific way. 778 vma = find_vma(mm, pp->addr);
475 * We must have no buffers or drop them. 779 if (!vma)
476 */ 780 goto set_status;
477 if (!page_has_buffers(page) ||
478 try_to_release_page(page, GFP_KERNEL)) {
479 rc = migrate_page(newpage, page);
480 goto unlock_both;
481 }
482 781
483 /* 782 page = follow_page(vma, pp->addr, FOLL_GET);
484 * On early passes with mapped pages simply 783 err = -ENOENT;
485 * retry. There may be a lock held for some 784 if (!page)
486 * buffers that may go away. Later 785 goto set_status;
487 * swap them out. 786
488 */ 787 if (PageReserved(page)) /* Check for zero page */
489 if (pass > 4) { 788 goto put_and_set;
789
790 pp->page = page;
791 err = page_to_nid(page);
792
793 if (err == pp->node)
490 /* 794 /*
491 * Persistently unable to drop buffers..... As a 795 * Node already in the right place
492 * measure of last resort we fall back to
493 * swap_page().
494 */ 796 */
495 unlock_page(newpage); 797 goto put_and_set;
496 newpage = NULL;
497 rc = swap_page(page);
498 goto next;
499 }
500 798
501unlock_both: 799 err = -EACCES;
502 unlock_page(newpage); 800 if (page_mapcount(page) > 1 &&
503 801 !migrate_all)
504unlock_page: 802 goto put_and_set;
505 unlock_page(page); 803
506 804 err = isolate_lru_page(page, &pagelist);
507next: 805put_and_set:
508 if (rc == -EAGAIN) { 806 /*
509 retry++; 807 * Either remove the duplicate refcount from
510 } else if (rc) { 808 * isolate_lru_page() or drop the page ref if it was
511 /* Permanent failure */ 809 * not isolated.
512 list_move(&page->lru, failed); 810 */
513 nr_failed++; 811 put_page(page);
514 } else { 812set_status:
515 if (newpage) { 813 pp->status = err;
516 /* Successful migration. Return page to LRU */
517 move_to_lru(newpage);
518 }
519 list_move(&page->lru, moved);
520 }
521 } 814 }
522 if (retry && pass++ < 10)
523 goto redo;
524 815
525 if (!swapwrite) 816 if (!list_empty(&pagelist))
526 current->flags &= ~PF_SWAPWRITE; 817 err = migrate_pages(&pagelist, new_page_node,
818 (unsigned long)pm);
819 else
820 err = -ENOENT;
527 821
528 return nr_failed + retry; 822 up_read(&mm->mmap_sem);
823 return err;
529} 824}
530 825
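Neither do_move_pages() nor new_page_node() is handed an element count; the pm[] array ends with an entry whose node is MAX_NUMNODES and the callback scans up to that marker. A standalone model of the convention (the MAX_NUMNODES value and the trimmed struct are demo assumptions):

/* Model of the end-marker convention used by the pm[] array above:
 * the list is terminated by an entry whose node is MAX_NUMNODES, so
 * callers can walk it without a separate length argument. */
#include <stdio.h>

#define MAX_NUMNODES 64		/* placeholder value for the demo */

struct page_to_node {
	unsigned long addr;
	int node;
	int status;
};

static int node_for_addr(const struct page_to_node *pm, unsigned long addr)
{
	while (pm->node != MAX_NUMNODES && pm->addr != addr)
		pm++;
	return pm->node == MAX_NUMNODES ? -1 : pm->node;
}

int main(void)
{
	struct page_to_node pm[] = {
		{ 0x1000, 0, 0 },
		{ 0x2000, 1, 0 },
		{ 0, MAX_NUMNODES, 0 },		/* end marker */
	};

	printf("0x2000 -> node %d\n", node_for_addr(pm, 0x2000));
	printf("0x3000 -> node %d\n", node_for_addr(pm, 0x3000));
	return 0;
}

sys_move_pages() below writes that end marker with pm[nr_pages].node = MAX_NUMNODES before handing the array over.
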
531/* 826/*
532 * Migration function for pages with buffers. This function can only be used 827 * Determine the nodes of a list of pages. The addr in the pm array
533 * if the underlying filesystem guarantees that no other references to "page" 828 * must have been set to the virtual address of which we want to determine
534 * exist. 829 * the node number.
535 */ 830 */
536int buffer_migrate_page(struct page *newpage, struct page *page) 831static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
537{ 832{
538 struct address_space *mapping = page->mapping; 833 down_read(&mm->mmap_sem);
539 struct buffer_head *bh, *head; 834
540 int rc; 835 for ( ; pm->node != MAX_NUMNODES; pm++) {
836 struct vm_area_struct *vma;
837 struct page *page;
838 int err;
839
840 err = -EFAULT;
841 vma = find_vma(mm, pm->addr);
842 if (!vma)
843 goto set_status;
844
845 page = follow_page(vma, pm->addr, 0);
846 err = -ENOENT;
847 /* Use PageReserved to check for zero page */
848 if (!page || PageReserved(page))
849 goto set_status;
850
851 err = page_to_nid(page);
852set_status:
853 pm->status = err;
854 }
541 855
542 if (!mapping) 856 up_read(&mm->mmap_sem);
543 return -EAGAIN; 857 return 0;
858}
544 859
545 if (!page_has_buffers(page)) 860/*
546 return migrate_page(newpage, page); 861 * Move a list of pages in the address space of the currently executing
862 * process.
863 */
864asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
865 const void __user * __user *pages,
866 const int __user *nodes,
867 int __user *status, int flags)
868{
869 int err = 0;
870 int i;
871 struct task_struct *task;
872 nodemask_t task_nodes;
873 struct mm_struct *mm;
874 struct page_to_node *pm = NULL;
547 875
548 head = page_buffers(page); 876 /* Check flags */
877 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
878 return -EINVAL;
549 879
550 rc = migrate_page_remove_references(newpage, page, 3); 880 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
881 return -EPERM;
551 882
552 if (rc) 883 /* Find the mm_struct */
553 return rc; 884 read_lock(&tasklist_lock);
885 task = pid ? find_task_by_pid(pid) : current;
886 if (!task) {
887 read_unlock(&tasklist_lock);
888 return -ESRCH;
889 }
890 mm = get_task_mm(task);
891 read_unlock(&tasklist_lock);
554 892
555 bh = head; 893 if (!mm)
556 do { 894 return -EINVAL;
557 get_bh(bh);
558 lock_buffer(bh);
559 bh = bh->b_this_page;
560 895
561 } while (bh != head); 896 /*
897 * Check if this process has the right to modify the specified
898 * process. The right exists if the process has administrative
899 * capabilities, superuser privileges or the same
900 * userid as the target process.
901 */
902 if ((current->euid != task->suid) && (current->euid != task->uid) &&
903 (current->uid != task->suid) && (current->uid != task->uid) &&
904 !capable(CAP_SYS_NICE)) {
905 err = -EPERM;
906 goto out2;
907 }
562 908
563 ClearPagePrivate(page); 909 err = security_task_movememory(task);
564 set_page_private(newpage, page_private(page)); 910 if (err)
565 set_page_private(page, 0); 911 goto out2;
566 put_page(page);
567 get_page(newpage);
568 912
569 bh = head;
570 do {
571 set_bh_page(bh, newpage, bh_offset(bh));
572 bh = bh->b_this_page;
573 913
574 } while (bh != head); 914 task_nodes = cpuset_mems_allowed(task);
575 915
576 SetPagePrivate(newpage); 916 /* Limit nr_pages so that the multiplication may not overflow */
917 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
918 err = -E2BIG;
919 goto out2;
920 }
577 921
578 migrate_page_copy(newpage, page); 922 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
923 if (!pm) {
924 err = -ENOMEM;
925 goto out2;
926 }
579 927
580 bh = head; 928 /*
581 do { 929 * Get parameters from user space and initialize the pm
582 unlock_buffer(bh); 930 * array. Return various errors if the user did something wrong.
583 put_bh(bh); 931 */
584 bh = bh->b_this_page; 932 for (i = 0; i < nr_pages; i++) {
933 const void *p;
585 934
586 } while (bh != head); 935 err = -EFAULT;
936 if (get_user(p, pages + i))
937 goto out;
587 938
588 return 0; 939 pm[i].addr = (unsigned long)p;
589} 940 if (nodes) {
590EXPORT_SYMBOL(buffer_migrate_page); 941 int node;
591 942
592/* 943 if (get_user(node, nodes + i))
593 * Migrate the list 'pagelist' of pages to a certain destination. 944 goto out;
594 *
595 * Specify destination with either non-NULL vma or dest_node >= 0
596 * Return the number of pages not migrated or error code
597 */
598int migrate_pages_to(struct list_head *pagelist,
599 struct vm_area_struct *vma, int dest)
600{
601 LIST_HEAD(newlist);
602 LIST_HEAD(moved);
603 LIST_HEAD(failed);
604 int err = 0;
605 unsigned long offset = 0;
606 int nr_pages;
607 struct page *page;
608 struct list_head *p;
609 945
610redo: 946 err = -ENODEV;
611 nr_pages = 0; 947 if (!node_online(node))
612 list_for_each(p, pagelist) { 948 goto out;
613 if (vma) {
614 /*
615 * The address passed to alloc_page_vma is used to
616 * generate the proper interleave behavior. We fake
617 * the address here by an increasing offset in order
618 * to get the proper distribution of pages.
619 *
620 * No decision has been made as to which page
621 * a certain old page is moved to so we cannot
622 * specify the correct address.
623 */
624 page = alloc_page_vma(GFP_HIGHUSER, vma,
625 offset + vma->vm_start);
626 offset += PAGE_SIZE;
627 }
628 else
629 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
630 949
631 if (!page) { 950 err = -EACCES;
632 err = -ENOMEM; 951 if (!node_isset(node, task_nodes))
633 goto out; 952 goto out;
953
954 pm[i].node = node;
634 } 955 }
635 list_add_tail(&page->lru, &newlist);
636 nr_pages++;
637 if (nr_pages > MIGRATE_CHUNK_SIZE)
638 break;
639 } 956 }
640 err = migrate_pages(pagelist, &newlist, &moved, &failed); 957 /* End marker */
958 pm[nr_pages].node = MAX_NUMNODES;
959
960 if (nodes)
961 err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
962 else
963 err = do_pages_stat(mm, pm);
641 964
642 putback_lru_pages(&moved); /* Call release pages instead ?? */ 965 if (err >= 0)
966 /* Return status information */
967 for (i = 0; i < nr_pages; i++)
968 if (put_user(pm[i].status, status + i))
969 err = -EFAULT;
643 970
644 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
645 goto redo;
646out: 971out:
647 /* Return leftover allocated pages */ 972 vfree(pm);
648 while (!list_empty(&newlist)) { 973out2:
649 page = list_entry(newlist.next, struct page, lru); 974 mmput(mm);
650 list_del(&page->lru); 975 return err;
651 __free_page(page);
652 }
653 list_splice(&failed, pagelist);
654 if (err < 0)
655 return err;
656
657 /* Calculate number of leftover pages */
658 nr_pages = 0;
659 list_for_each(p, pagelist)
660 nr_pages++;
661 return nr_pages;
662} 976}
977#endif
978
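From user space the new syscall is normally reached through the libnuma wrapper move_pages(2). A hedged sketch, assuming libnuma's numaif.h is installed, node 0 is online and the program is linked with -lnuma: passing a nodes array moves the pages, passing NULL merely reports their current node, matching do_move_pages() versus do_pages_stat() above.

#include <numaif.h>		/* move_pages(), MPOL_MF_MOVE */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	void *pages[1];
	int nodes[1] = { 0 };		/* assumption: node 0 exists */
	int status[1];
	void *buf;
	long rc;

	if (posix_memalign(&buf, psize, psize))
		return 1;
	memset(buf, 1, psize);		/* fault the page in first */
	pages[0] = buf;

	/* pid 0 means the calling process; ask for a move to node 0 */
	rc = move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
	if (rc < 0)
		perror("move_pages");
	printf("move: rc=%ld status[0]=%d\n", rc, status[0]);

	/* nodes == NULL: only report which node the page sits on */
	rc = move_pages(0, 1, pages, NULL, status, 0);
	if (rc < 0)
		perror("move_pages (query)");
	printf("query: page is on node %d\n", status[0]);

	free(buf);
	return 0;
}
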
diff --git a/mm/mmap.c b/mm/mmap.c
index e6ee12344b13..6446c6134b04 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1065,7 +1065,8 @@ munmap_back:
1065 vma->vm_start = addr; 1065 vma->vm_start = addr;
1066 vma->vm_end = addr + len; 1066 vma->vm_end = addr + len;
1067 vma->vm_flags = vm_flags; 1067 vma->vm_flags = vm_flags;
1068 vma->vm_page_prot = protection_map[vm_flags & 0x0f]; 1068 vma->vm_page_prot = protection_map[vm_flags &
1069 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
1069 vma->vm_pgoff = pgoff; 1070 vma->vm_pgoff = pgoff;
1070 1071
1071 if (file) { 1072 if (file) {
@@ -1089,6 +1090,12 @@ munmap_back:
1089 goto free_vma; 1090 goto free_vma;
1090 } 1091 }
1091 1092
1093 /* Don't make the VMA automatically writable if it's shared, but the
1094 * backer wishes to know when pages are first written to */
1095 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1096 vma->vm_page_prot =
1097 protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
1098
1092 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform 1099 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1093 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) 1100 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1094 * that memory reservation must be checked; but that reservation 1101 * that memory reservation must be checked; but that reservation
@@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1921 vma->vm_end = addr + len; 1928 vma->vm_end = addr + len;
1922 vma->vm_pgoff = pgoff; 1929 vma->vm_pgoff = pgoff;
1923 vma->vm_flags = flags; 1930 vma->vm_flags = flags;
1924 vma->vm_page_prot = protection_map[flags & 0x0f]; 1931 vma->vm_page_prot = protection_map[flags &
1932 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
1925 vma_link(mm, vma, prev, rb_link, rb_parent); 1933 vma_link(mm, vma, prev, rb_link, rb_parent);
1926out: 1934out:
1927 mm->total_vm += len >> PAGE_SHIFT; 1935 mm->total_vm += len >> PAGE_SHIFT;
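
Spelling the index mask as VM_READ|VM_WRITE|VM_EXEC|VM_SHARED is a readability change only: those four flags occupy the low four bits, so the mask is still the old 0x0f. The page_mkwrite branch above then drops VM_SHARED from the index, so a shared writable mapping starts with the private (non-writable) protection and the first store faults, which is how the backer gets its notification. A standalone model; the flag values mirror include/linux/mm.h and the strings are only maps-style labels for the sixteen slots, not the kernel's pgprot_t contents.

/*
 * Standalone model of protection_map[] indexing.  Flag values mirror
 * include/linux/mm.h; the strings are /proc/<pid>/maps-style labels
 * for the demo, not the kernel's pgprot_t contents.
 */
#include <stdio.h>

#define VM_READ	  0x00000001UL
#define VM_WRITE  0x00000002UL
#define VM_EXEC	  0x00000004UL
#define VM_SHARED 0x00000008UL

static const char *protection_map[16] = {
	/* indexed by vm_flags & 0xf: READ=bit0, WRITE=bit1, EXEC=bit2, SHARED=bit3 */
	"---p", "r--p", "-w-p", "rw-p", "--xp", "r-xp", "-wxp", "rwxp",
	"---s", "r--s", "-w-s", "rw-s", "--xs", "r-xs", "-wxs", "rwxs",
};

int main(void)
{
	unsigned long vm_flags = VM_READ | VM_WRITE | VM_SHARED;
	unsigned long mask = VM_READ | VM_WRITE | VM_EXEC | VM_SHARED;

	printf("named mask = %#lx (same as the old 0x0f)\n", mask);
	printf("shared rw mapping          -> %s\n",
	       protection_map[vm_flags & mask]);
	/* page_mkwrite case: drop VM_SHARED so the first write faults */
	printf("same mapping with mkwrite  -> %s\n",
	       protection_map[vm_flags & (mask & ~VM_SHARED)]);
	return 0;
}
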
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c14d4289b61..638edabaff71 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -19,7 +19,8 @@
19#include <linux/mempolicy.h> 19#include <linux/mempolicy.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22 22#include <linux/swap.h>
23#include <linux/swapops.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24#include <asm/pgtable.h> 25#include <asm/pgtable.h>
25#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
@@ -28,12 +29,13 @@
28static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 29static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
29 unsigned long addr, unsigned long end, pgprot_t newprot) 30 unsigned long addr, unsigned long end, pgprot_t newprot)
30{ 31{
31 pte_t *pte; 32 pte_t *pte, oldpte;
32 spinlock_t *ptl; 33 spinlock_t *ptl;
33 34
34 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 35 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
35 do { 36 do {
36 if (pte_present(*pte)) { 37 oldpte = *pte;
38 if (pte_present(oldpte)) {
37 pte_t ptent; 39 pte_t ptent;
38 40
39 /* Avoid an SMP race with hardware updated dirty/clean 41 /* Avoid an SMP race with hardware updated dirty/clean
@@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
43 ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); 45 ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
44 set_pte_at(mm, addr, pte, ptent); 46 set_pte_at(mm, addr, pte, ptent);
45 lazy_mmu_prot_update(ptent); 47 lazy_mmu_prot_update(ptent);
48#ifdef CONFIG_MIGRATION
49 } else if (!pte_file(oldpte)) {
50 swp_entry_t entry = pte_to_swp_entry(oldpte);
51
52 if (is_write_migration_entry(entry)) {
53 /*
54 * A protection check is difficult so
55 * just be safe and disable write
56 */
57 make_migration_entry_read(&entry);
58 set_pte_at(mm, addr, pte,
59 swp_entry_to_pte(entry));
60 }
61#endif
46 } 62 }
63
47 } while (pte++, addr += PAGE_SIZE, addr != end); 64 } while (pte++, addr += PAGE_SIZE, addr != end);
48 pte_unmap_unlock(pte - 1, ptl); 65 pte_unmap_unlock(pte - 1, ptl);
49} 66}
@@ -106,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
106 unsigned long oldflags = vma->vm_flags; 123 unsigned long oldflags = vma->vm_flags;
107 long nrpages = (end - start) >> PAGE_SHIFT; 124 long nrpages = (end - start) >> PAGE_SHIFT;
108 unsigned long charged = 0; 125 unsigned long charged = 0;
126 unsigned int mask;
109 pgprot_t newprot; 127 pgprot_t newprot;
110 pgoff_t pgoff; 128 pgoff_t pgoff;
111 int error; 129 int error;
@@ -132,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
132 } 150 }
133 } 151 }
134 152
135 newprot = protection_map[newflags & 0xf];
136
137 /* 153 /*
138 * First try to merge with previous and/or next vma. 154 * First try to merge with previous and/or next vma.
139 */ 155 */
@@ -160,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
160 } 176 }
161 177
162success: 178success:
179 /* Don't make the VMA automatically writable if it's shared, but the
180 * backer wishes to know when pages are first written to */
181 mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
182 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
183 mask &= ~VM_SHARED;
184
185 newprot = protection_map[newflags & mask];
186
163 /* 187 /*
164 * vm_flags and vm_page_prot are protected by the mmap_sem 188 * vm_flags and vm_page_prot are protected by the mmap_sem
165 * held in write mode. 189 * held in write mode.
@@ -205,8 +229,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
205 /* 229 /*
206 * Does the application expect PROT_READ to imply PROT_EXEC: 230 * Does the application expect PROT_READ to imply PROT_EXEC:
207 */ 231 */
208 if (unlikely((prot & PROT_READ) && 232 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
209 (current->personality & READ_IMPLIES_EXEC)))
210 prot |= PROT_EXEC; 233 prot |= PROT_EXEC;
211 234
212 vm_flags = calc_vm_prot_bits(prot); 235 vm_flags = calc_vm_prot_bits(prot);
diff --git a/mm/msync.c b/mm/msync.c
index bc6c95376366..d083544df21b 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
170 * just ignore them, but return -ENOMEM at the end. 170 * just ignore them, but return -ENOMEM at the end.
171 */ 171 */
172 down_read(&current->mm->mmap_sem); 172 down_read(&current->mm->mmap_sem);
173 if (flags & MS_SYNC)
174 current->flags |= PF_SYNCWRITE;
175 vma = find_vma(current->mm, start); 173 vma = find_vma(current->mm, start);
176 if (!vma) { 174 if (!vma) {
177 error = -ENOMEM; 175 error = -ENOMEM;
@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
228 } 226 }
229 } while (vma && !done); 227 } while (vma && !done);
230out_unlock: 228out_unlock:
231 current->flags &= ~PF_SYNCWRITE;
232 up_read(&current->mm->mmap_sem); 229 up_read(&current->mm->mmap_sem);
233out: 230out:
234 return error; 231 return error;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 042e6436c3ee..d46ed0f1dc06 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -22,10 +22,11 @@
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24 24
25int sysctl_panic_on_oom;
25/* #define DEBUG */ 26/* #define DEBUG */
26 27
27/** 28/**
28 * oom_badness - calculate a numeric value for how bad this task has been 29 * badness - calculate a numeric value for how bad this task has been
29 * @p: task struct of which task we should calculate 30 * @p: task struct of which task we should calculate
30 * @uptime: current uptime in seconds 31 * @uptime: current uptime in seconds
31 * 32 *
@@ -200,7 +201,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
200 continue; 201 continue;
201 202
202 /* 203 /*
203 * This is in the process of releasing memory so for wait it 204 * This is in the process of releasing memory so wait for it
204 * to finish before killing some other task by mistake. 205 * to finish before killing some other task by mistake.
205 */ 206 */
206 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || 207 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
@@ -306,7 +307,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
306} 307}
307 308
308/** 309/**
309 * oom_kill - kill the "best" process when we run out of memory 310 * out_of_memory - kill the "best" process when we run out of memory
310 * 311 *
311 * If we run out of memory, we have the choice between either 312 * If we run out of memory, we have the choice between either
312 * killing a random task (bad), letting the system crash (worse) 313 * killing a random task (bad), letting the system crash (worse)
@@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
344 break; 345 break;
345 346
346 case CONSTRAINT_NONE: 347 case CONSTRAINT_NONE:
348 if (sysctl_panic_on_oom)
349 panic("out of memory. panic_on_oom is selected\n");
347retry: 350retry:
348 /* 351 /*
349 * Rambo mode: Shoot down a process and hope it solves whatever 352 * Rambo mode: Shoot down a process and hope it solves whatever
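
The new sysctl_panic_on_oom flag surfaces as the vm.panic_on_oom sysctl; a hedged user-space sketch of enabling it through the conventional /proc/sys path (path assumed from the usual sysctl layout, needs root):

/* Toggle the new knob from user space; assumes the conventional
 * /proc/sys/vm/panic_on_oom file for the sysctl backing
 * sysctl_panic_on_oom. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/panic_on_oom", "w");

	if (!f) {
		perror("panic_on_oom");
		return 1;
	}
	fputs("1\n", f);	/* 0 = kill one task (default), 1 = panic */
	fclose(f);
	return 0;
}

Left at its default of 0 the kernel keeps killing a single task; set to 1, a CONSTRAINT_NONE OOM panics instead, as the hunk above shows.
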
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 75d7f48b79bb..8ccf6f1b1473 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -204,6 +204,7 @@ static void balance_dirty_pages(struct address_space *mapping)
204 .sync_mode = WB_SYNC_NONE, 204 .sync_mode = WB_SYNC_NONE,
205 .older_than_this = NULL, 205 .older_than_this = NULL,
206 .nr_to_write = write_chunk, 206 .nr_to_write = write_chunk,
207 .range_cyclic = 1,
207 }; 208 };
208 209
209 get_dirty_limits(&wbs, &background_thresh, 210 get_dirty_limits(&wbs, &background_thresh,
@@ -331,6 +332,7 @@ static void background_writeout(unsigned long _min_pages)
331 .older_than_this = NULL, 332 .older_than_this = NULL,
332 .nr_to_write = 0, 333 .nr_to_write = 0,
333 .nonblocking = 1, 334 .nonblocking = 1,
335 .range_cyclic = 1,
334 }; 336 };
335 337
336 for ( ; ; ) { 338 for ( ; ; ) {
@@ -407,6 +409,7 @@ static void wb_kupdate(unsigned long arg)
407 .nr_to_write = 0, 409 .nr_to_write = 0,
408 .nonblocking = 1, 410 .nonblocking = 1,
409 .for_kupdate = 1, 411 .for_kupdate = 1,
412 .range_cyclic = 1,
410 }; 413 };
411 414
412 sync_supers(); 415 sync_supers();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 253a450c400d..423db0db7c02 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,7 @@
37#include <linux/nodemask.h> 37#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h> 39#include <linux/mempolicy.h>
40#include <linux/stop_machine.h>
40 41
41#include <asm/tlbflush.h> 42#include <asm/tlbflush.h>
42#include <asm/div64.h> 43#include <asm/div64.h>
@@ -83,8 +84,8 @@ EXPORT_SYMBOL(zone_table);
83static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; 84static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
84int min_free_kbytes = 1024; 85int min_free_kbytes = 1024;
85 86
86unsigned long __initdata nr_kernel_pages; 87unsigned long __meminitdata nr_kernel_pages;
87unsigned long __initdata nr_all_pages; 88unsigned long __meminitdata nr_all_pages;
88 89
89#ifdef CONFIG_DEBUG_VM 90#ifdef CONFIG_DEBUG_VM
90static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 91static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -286,22 +287,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
286 * we can do coalesce a page and its buddy if 287 * we can do coalesce a page and its buddy if
287 * (a) the buddy is not in a hole && 288 * (a) the buddy is not in a hole &&
288 * (b) the buddy is in the buddy system && 289 * (b) the buddy is in the buddy system &&
289 * (c) a page and its buddy have the same order. 290 * (c) a page and its buddy have the same order &&
291 * (d) a page and its buddy are in the same zone.
290 * 292 *
291 * For recording whether a page is in the buddy system, we use PG_buddy. 293 * For recording whether a page is in the buddy system, we use PG_buddy.
292 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 294 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
293 * 295 *
294 * For recording page's order, we use page_private(page). 296 * For recording page's order, we use page_private(page).
295 */ 297 */
296static inline int page_is_buddy(struct page *page, int order) 298static inline int page_is_buddy(struct page *page, struct page *buddy,
299 int order)
297{ 300{
298#ifdef CONFIG_HOLES_IN_ZONE 301#ifdef CONFIG_HOLES_IN_ZONE
299 if (!pfn_valid(page_to_pfn(page))) 302 if (!pfn_valid(page_to_pfn(buddy)))
300 return 0; 303 return 0;
301#endif 304#endif
302 305
303 if (PageBuddy(page) && page_order(page) == order) { 306 if (page_zone_id(page) != page_zone_id(buddy))
304 BUG_ON(page_count(page) != 0); 307 return 0;
308
309 if (PageBuddy(buddy) && page_order(buddy) == order) {
310 BUG_ON(page_count(buddy) != 0);
305 return 1; 311 return 1;
306 } 312 }
307 return 0; 313 return 0;
@@ -352,7 +358,7 @@ static inline void __free_one_page(struct page *page,
352 struct page *buddy; 358 struct page *buddy;
353 359
354 buddy = __page_find_buddy(page, page_idx, order); 360 buddy = __page_find_buddy(page, page_idx, order);
355 if (!page_is_buddy(buddy, order)) 361 if (!page_is_buddy(page, buddy, order))
356 break; /* Move the buddy up one level. */ 362 break; /* Move the buddy up one level. */
357 363
358 list_del(&buddy->lru); 364 list_del(&buddy->lru);
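
page_is_buddy() is only the eligibility test; the candidate buddy itself comes from plain arithmetic on the zone-relative page index. A standalone demo of that arithmetic, mirroring the XOR/AND used by __page_find_buddy() and __find_combined_index(); the page_zone_id() comparison added above is what keeps a page and its arithmetic buddy from being merged across a zone boundary.

/* Standalone demo of buddy-index arithmetic for a zone-relative
 * page index; mirrors __page_find_buddy()/__find_combined_index(). */
#include <stdio.h>

static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);
}

int main(void)
{
	unsigned long idx = 12;		/* arbitrary zone-relative index */
	unsigned int order;

	for (order = 0; order < 4; order++)
		printf("order %u: buddy of %lu is %lu, merge to %lu\n",
		       order, idx, buddy_index(idx, order),
		       combined_index(idx, order));
	return 0;
}
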
@@ -1485,7 +1491,7 @@ void show_free_areas(void)
1485 } 1491 }
1486 1492
1487 for_each_zone(zone) { 1493 for_each_zone(zone) {
1488 unsigned long nr, flags, order, total = 0; 1494 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1489 1495
1490 show_node(zone); 1496 show_node(zone);
1491 printk("%s: ", zone->name); 1497 printk("%s: ", zone->name);
@@ -1496,11 +1502,12 @@ void show_free_areas(void)
1496 1502
1497 spin_lock_irqsave(&zone->lock, flags); 1503 spin_lock_irqsave(&zone->lock, flags);
1498 for (order = 0; order < MAX_ORDER; order++) { 1504 for (order = 0; order < MAX_ORDER; order++) {
1499 nr = zone->free_area[order].nr_free; 1505 nr[order] = zone->free_area[order].nr_free;
1500 total += nr << order; 1506 total += nr[order] << order;
1501 printk("%lu*%lukB ", nr, K(1UL) << order);
1502 } 1507 }
1503 spin_unlock_irqrestore(&zone->lock, flags); 1508 spin_unlock_irqrestore(&zone->lock, flags);
1509 for (order = 0; order < MAX_ORDER; order++)
1510 printk("%lu*%lukB ", nr[order], K(1UL) << order);
1504 printk("= %lukB\n", K(total)); 1511 printk("= %lukB\n", K(total));
1505 } 1512 }
1506 1513
@@ -1512,7 +1519,7 @@ void show_free_areas(void)
1512 * 1519 *
1513 * Add all populated zones of a node to the zonelist. 1520 * Add all populated zones of a node to the zonelist.
1514 */ 1521 */
1515static int __init build_zonelists_node(pg_data_t *pgdat, 1522static int __meminit build_zonelists_node(pg_data_t *pgdat,
1516 struct zonelist *zonelist, int nr_zones, int zone_type) 1523 struct zonelist *zonelist, int nr_zones, int zone_type)
1517{ 1524{
1518 struct zone *zone; 1525 struct zone *zone;
@@ -1548,7 +1555,7 @@ static inline int highest_zone(int zone_bits)
1548 1555
1549#ifdef CONFIG_NUMA 1556#ifdef CONFIG_NUMA
1550#define MAX_NODE_LOAD (num_online_nodes()) 1557#define MAX_NODE_LOAD (num_online_nodes())
1551static int __initdata node_load[MAX_NUMNODES]; 1558static int __meminitdata node_load[MAX_NUMNODES];
1552/** 1559/**
1553 * find_next_best_node - find the next node that should appear in a given node's fallback list 1560 * find_next_best_node - find the next node that should appear in a given node's fallback list
1554 * @node: node whose fallback list we're appending 1561 * @node: node whose fallback list we're appending
@@ -1563,7 +1570,7 @@ static int __initdata node_load[MAX_NUMNODES];
1563 * on them otherwise. 1570 * on them otherwise.
1564 * It returns -1 if no node is found. 1571 * It returns -1 if no node is found.
1565 */ 1572 */
1566static int __init find_next_best_node(int node, nodemask_t *used_node_mask) 1573static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1567{ 1574{
1568 int n, val; 1575 int n, val;
1569 int min_val = INT_MAX; 1576 int min_val = INT_MAX;
@@ -1609,7 +1616,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
1609 return best_node; 1616 return best_node;
1610} 1617}
1611 1618
1612static void __init build_zonelists(pg_data_t *pgdat) 1619static void __meminit build_zonelists(pg_data_t *pgdat)
1613{ 1620{
1614 int i, j, k, node, local_node; 1621 int i, j, k, node, local_node;
1615 int prev_node, load; 1622 int prev_node, load;
@@ -1661,7 +1668,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
1661 1668
1662#else /* CONFIG_NUMA */ 1669#else /* CONFIG_NUMA */
1663 1670
1664static void __init build_zonelists(pg_data_t *pgdat) 1671static void __meminit build_zonelists(pg_data_t *pgdat)
1665{ 1672{
1666 int i, j, k, node, local_node; 1673 int i, j, k, node, local_node;
1667 1674
@@ -1699,14 +1706,29 @@ static void __init build_zonelists(pg_data_t *pgdat)
1699 1706
1700#endif /* CONFIG_NUMA */ 1707#endif /* CONFIG_NUMA */
1701 1708
1702void __init build_all_zonelists(void) 1709/* return value is int just for stop_machine_run() */
1710static int __meminit __build_all_zonelists(void *dummy)
1703{ 1711{
1704 int i; 1712 int nid;
1713 for_each_online_node(nid)
1714 build_zonelists(NODE_DATA(nid));
1715 return 0;
1716}
1705 1717
1706 for_each_online_node(i) 1718void __meminit build_all_zonelists(void)
1707 build_zonelists(NODE_DATA(i)); 1719{
1708 printk("Built %i zonelists\n", num_online_nodes()); 1720 if (system_state == SYSTEM_BOOTING) {
1709 cpuset_init_current_mems_allowed(); 1721 __build_all_zonelists(0);
1722 cpuset_init_current_mems_allowed();
1723 } else {
1724 /* we have to stop all cpus to guarantee there is no user
1725 of zonelist */
1726 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
1727 /* cpuset refresh routine should be here */
1728 }
1729 vm_total_pages = nr_free_pagecache_pages();
1730 printk("Built %i zonelists. Total pages: %ld\n",
1731 num_online_nodes(), vm_total_pages);
1710} 1732}
1711 1733
1712/* 1734/*
@@ -1722,7 +1744,8 @@ void __init build_all_zonelists(void)
1722 */ 1744 */
1723#define PAGES_PER_WAITQUEUE 256 1745#define PAGES_PER_WAITQUEUE 256
1724 1746
1725static inline unsigned long wait_table_size(unsigned long pages) 1747#ifndef CONFIG_MEMORY_HOTPLUG
1748static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1726{ 1749{
1727 unsigned long size = 1; 1750 unsigned long size = 1;
1728 1751
@@ -1740,6 +1763,29 @@ static inline unsigned long wait_table_size(unsigned long pages)
1740 1763
1741 return max(size, 4UL); 1764 return max(size, 4UL);
1742} 1765}
1766#else
1767/*
1768 * A zone's size might be changed by hot-add, so it is not possible to determine
1769 * a suitable size for its wait_table. So we use the maximum size now.
1770 *
1771 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
1772 *
1773 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
1774 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
1775 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
1776 *
1777 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
1778 * or more when sized the traditional way (see above). It equals:
1779 *
1780 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
1781 * ia64(16K page size) : = ( 8G + 4M)byte.
1782 * powerpc (64K page size) : = (32G +16M)byte.
1783 */
1784static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1785{
1786 return 4096UL;
1787}
1788#endif
1743 1789
1744/* 1790/*
1745 * This is an integer logarithm so that shifts can be used later 1791 * This is an integer logarithm so that shifts can be used later
@@ -2005,23 +2051,46 @@ void __init setup_per_cpu_pageset(void)
2005#endif 2051#endif
2006 2052
2007static __meminit 2053static __meminit
2008void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2054int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2009{ 2055{
2010 int i; 2056 int i;
2011 struct pglist_data *pgdat = zone->zone_pgdat; 2057 struct pglist_data *pgdat = zone->zone_pgdat;
2058 size_t alloc_size;
2012 2059
2013 /* 2060 /*
2014 * The per-page waitqueue mechanism uses hashed waitqueues 2061 * The per-page waitqueue mechanism uses hashed waitqueues
2015 * per zone. 2062 * per zone.
2016 */ 2063 */
2017 zone->wait_table_size = wait_table_size(zone_size_pages); 2064 zone->wait_table_hash_nr_entries =
2018 zone->wait_table_bits = wait_table_bits(zone->wait_table_size); 2065 wait_table_hash_nr_entries(zone_size_pages);
2019 zone->wait_table = (wait_queue_head_t *) 2066 zone->wait_table_bits =
2020 alloc_bootmem_node(pgdat, zone->wait_table_size 2067 wait_table_bits(zone->wait_table_hash_nr_entries);
2021 * sizeof(wait_queue_head_t)); 2068 alloc_size = zone->wait_table_hash_nr_entries
2069 * sizeof(wait_queue_head_t);
2070
2071 if (system_state == SYSTEM_BOOTING) {
2072 zone->wait_table = (wait_queue_head_t *)
2073 alloc_bootmem_node(pgdat, alloc_size);
2074 } else {
2075 /*
2076 * This case means that a zone whose size was 0 gets new memory
2077 * via memory hot-add.
2078 * But it may be the case that a new node was hot-added. In
2079 * this case vmalloc() will not be able to use this new node's
2080 * memory - this wait_table must be initialized to use this new
2081 * node itself as well.
2082 * To use this new node's memory, further consideration will be
2083 * necessary.
2084 */
2085 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
2086 }
2087 if (!zone->wait_table)
2088 return -ENOMEM;
2022 2089
2023 for(i = 0; i < zone->wait_table_size; ++i) 2090 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2024 init_waitqueue_head(zone->wait_table + i); 2091 init_waitqueue_head(zone->wait_table + i);
2092
2093 return 0;
2025} 2094}
2026 2095
2027static __meminit void zone_pcp_init(struct zone *zone) 2096static __meminit void zone_pcp_init(struct zone *zone)
@@ -2043,12 +2112,15 @@ static __meminit void zone_pcp_init(struct zone *zone)
2043 zone->name, zone->present_pages, batch); 2112 zone->name, zone->present_pages, batch);
2044} 2113}
2045 2114
2046static __meminit void init_currently_empty_zone(struct zone *zone, 2115__meminit int init_currently_empty_zone(struct zone *zone,
2047 unsigned long zone_start_pfn, unsigned long size) 2116 unsigned long zone_start_pfn,
2117 unsigned long size)
2048{ 2118{
2049 struct pglist_data *pgdat = zone->zone_pgdat; 2119 struct pglist_data *pgdat = zone->zone_pgdat;
2050 2120 int ret;
2051 zone_wait_table_init(zone, size); 2121 ret = zone_wait_table_init(zone, size);
2122 if (ret)
2123 return ret;
2052 pgdat->nr_zones = zone_idx(zone) + 1; 2124 pgdat->nr_zones = zone_idx(zone) + 1;
2053 2125
2054 zone->zone_start_pfn = zone_start_pfn; 2126 zone->zone_start_pfn = zone_start_pfn;
@@ -2056,6 +2128,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
2056 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2128 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2057 2129
2058 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 2130 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
2131
2132 return 0;
2059} 2133}
2060 2134
2061/* 2135/*
@@ -2064,12 +2138,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
2064 * - mark all memory queues empty 2138 * - mark all memory queues empty
2065 * - clear the memory bitmaps 2139 * - clear the memory bitmaps
2066 */ 2140 */
2067static void __init free_area_init_core(struct pglist_data *pgdat, 2141static void __meminit free_area_init_core(struct pglist_data *pgdat,
2068 unsigned long *zones_size, unsigned long *zholes_size) 2142 unsigned long *zones_size, unsigned long *zholes_size)
2069{ 2143{
2070 unsigned long j; 2144 unsigned long j;
2071 int nid = pgdat->node_id; 2145 int nid = pgdat->node_id;
2072 unsigned long zone_start_pfn = pgdat->node_start_pfn; 2146 unsigned long zone_start_pfn = pgdat->node_start_pfn;
2147 int ret;
2073 2148
2074 pgdat_resize_init(pgdat); 2149 pgdat_resize_init(pgdat);
2075 pgdat->nr_zones = 0; 2150 pgdat->nr_zones = 0;
@@ -2111,7 +2186,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
2111 continue; 2186 continue;
2112 2187
2113 zonetable_add(zone, nid, j, zone_start_pfn, size); 2188 zonetable_add(zone, nid, j, zone_start_pfn, size);
2114 init_currently_empty_zone(zone, zone_start_pfn, size); 2189 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
2190 BUG_ON(ret);
2115 zone_start_pfn += size; 2191 zone_start_pfn += size;
2116 } 2192 }
2117} 2193}
@@ -2152,7 +2228,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2152#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2228#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2153} 2229}
2154 2230
2155void __init free_area_init_node(int nid, struct pglist_data *pgdat, 2231void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2156 unsigned long *zones_size, unsigned long node_start_pfn, 2232 unsigned long *zones_size, unsigned long node_start_pfn,
2157 unsigned long *zholes_size) 2233 unsigned long *zholes_size)
2158{ 2234{
@@ -2804,42 +2880,14 @@ void *__init alloc_large_system_hash(const char *tablename,
2804} 2880}
2805 2881
2806#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 2882#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
2807/*
2808 * pfn <-> page translation. out-of-line version.
2809 * (see asm-generic/memory_model.h)
2810 */
2811#if defined(CONFIG_FLATMEM)
2812struct page *pfn_to_page(unsigned long pfn) 2883struct page *pfn_to_page(unsigned long pfn)
2813{ 2884{
2814 return mem_map + (pfn - ARCH_PFN_OFFSET); 2885 return __pfn_to_page(pfn);
2815} 2886}
2816unsigned long page_to_pfn(struct page *page) 2887unsigned long page_to_pfn(struct page *page)
2817{ 2888{
2818 return (page - mem_map) + ARCH_PFN_OFFSET; 2889 return __page_to_pfn(page);
2819}
2820#elif defined(CONFIG_DISCONTIGMEM)
2821struct page *pfn_to_page(unsigned long pfn)
2822{
2823 int nid = arch_pfn_to_nid(pfn);
2824 return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
2825}
2826unsigned long page_to_pfn(struct page *page)
2827{
2828 struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
2829 return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
2830}
2831#elif defined(CONFIG_SPARSEMEM)
2832struct page *pfn_to_page(unsigned long pfn)
2833{
2834 return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
2835}
2836
2837unsigned long page_to_pfn(struct page *page)
2838{
2839 long section_id = page_to_section(page);
2840 return page - __section_mem_map_addr(__nr_to_section(section_id));
2841} 2890}
2842#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */
2843EXPORT_SYMBOL(pfn_to_page); 2891EXPORT_SYMBOL(pfn_to_page);
2844EXPORT_SYMBOL(page_to_pfn); 2892EXPORT_SYMBOL(page_to_pfn);
2845#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 2893#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
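
The page_alloc.c hunks above turn zone_wait_table_init() and init_currently_empty_zone() into int-returning functions so that a failed wait-table allocation (vmalloc'ed in the hotplug case shown at the top of this section) surfaces as -ENOMEM instead of being ignored; the boot-time caller in free_area_init_core() still treats failure as fatal via BUG_ON(ret). A minimal sketch of how a memory-hotplug style caller could propagate the error instead -- hotadd_new_zone() is a hypothetical name, not part of this patch:

/*
 * Illustrative only: shows why the int return matters.  Error handling
 * beyond propagation is elided.
 */
static int __meminit hotadd_new_zone(struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long nr_pages)
{
	int ret;

	ret = init_currently_empty_zone(zone, start_pfn, nr_pages);
	if (ret)
		return ret;	/* zone_wait_table_init() could not allocate */

	/* further hotplug setup would follow here */
	return 0;
}
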
diff --git a/mm/pdflush.c b/mm/pdflush.c
index c4b6d0afd736..df7e50b8f70c 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -202,8 +202,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
202 unsigned long flags; 202 unsigned long flags;
203 int ret = 0; 203 int ret = 0;
204 204
205 if (fn == NULL) 205 BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
206 BUG(); /* Hard to diagnose if it's deferred */
207 206
208 spin_lock_irqsave(&pdflush_lock, flags); 207 spin_lock_irqsave(&pdflush_lock, flags);
209 if (list_empty(&pdflush_list)) { 208 if (list_empty(&pdflush_list)) {
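
The pdflush.c hunk is a pure idiom cleanup: an open-coded NULL check followed by BUG() collapses into a single BUG_ON(), keeping the check and its comment on one line. The same transformation in isolation (generic sketch, not specific to pdflush):

	/* before */
	if (ptr == NULL)
		BUG();

	/* after */
	BUG_ON(ptr == NULL);
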
diff --git a/mm/rmap.c b/mm/rmap.c
index 1963e269314d..882a85826bb2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
103 spin_lock(&mm->page_table_lock); 103 spin_lock(&mm->page_table_lock);
104 if (likely(!vma->anon_vma)) { 104 if (likely(!vma->anon_vma)) {
105 vma->anon_vma = anon_vma; 105 vma->anon_vma = anon_vma;
106 list_add(&vma->anon_vma_node, &anon_vma->head); 106 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
107 allocated = NULL; 107 allocated = NULL;
108 } 108 }
109 spin_unlock(&mm->page_table_lock); 109 spin_unlock(&mm->page_table_lock);
@@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma)
127 struct anon_vma *anon_vma = vma->anon_vma; 127 struct anon_vma *anon_vma = vma->anon_vma;
128 128
129 if (anon_vma) { 129 if (anon_vma) {
130 list_add(&vma->anon_vma_node, &anon_vma->head); 130 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
131 validate_anon_vma(vma); 131 validate_anon_vma(vma);
132 } 132 }
133} 133}
@@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma)
138 138
139 if (anon_vma) { 139 if (anon_vma) {
140 spin_lock(&anon_vma->lock); 140 spin_lock(&anon_vma->lock);
141 list_add(&vma->anon_vma_node, &anon_vma->head); 141 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
142 validate_anon_vma(vma); 142 validate_anon_vma(vma);
143 spin_unlock(&anon_vma->lock); 143 spin_unlock(&anon_vma->lock);
144 } 144 }
@@ -205,44 +205,6 @@ out:
205 return anon_vma; 205 return anon_vma;
206} 206}
207 207
208#ifdef CONFIG_MIGRATION
209/*
210 * Remove an anonymous page from swap replacing the swap pte's
211 * through real pte's pointing to valid pages and then releasing
212 * the page from the swap cache.
213 *
214 * Must hold page lock on page and mmap_sem of one vma that contains
215 * the page.
216 */
217void remove_from_swap(struct page *page)
218{
219 struct anon_vma *anon_vma;
220 struct vm_area_struct *vma;
221 unsigned long mapping;
222
223 if (!PageSwapCache(page))
224 return;
225
226 mapping = (unsigned long)page->mapping;
227
228 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
229 return;
230
231 /*
232 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
233 */
234 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
235 spin_lock(&anon_vma->lock);
236
237 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
238 remove_vma_swap(vma, page);
239
240 spin_unlock(&anon_vma->lock);
241 delete_from_swap_cache(page);
242}
243EXPORT_SYMBOL(remove_from_swap);
244#endif
245
246/* 208/*
247 * At what user virtual address is page expected in vma? 209 * At what user virtual address is page expected in vma?
248 */ 210 */
@@ -578,7 +540,7 @@ void page_remove_rmap(struct page *page)
578 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 540 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
579 */ 541 */
580static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 542static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
581 int ignore_refs) 543 int migration)
582{ 544{
583 struct mm_struct *mm = vma->vm_mm; 545 struct mm_struct *mm = vma->vm_mm;
584 unsigned long address; 546 unsigned long address;
@@ -602,7 +564,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
602 */ 564 */
603 if ((vma->vm_flags & VM_LOCKED) || 565 if ((vma->vm_flags & VM_LOCKED) ||
604 (ptep_clear_flush_young(vma, address, pte) 566 (ptep_clear_flush_young(vma, address, pte)
605 && !ignore_refs)) { 567 && !migration)) {
606 ret = SWAP_FAIL; 568 ret = SWAP_FAIL;
607 goto out_unmap; 569 goto out_unmap;
608 } 570 }
@@ -620,24 +582,45 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
620 582
621 if (PageAnon(page)) { 583 if (PageAnon(page)) {
622 swp_entry_t entry = { .val = page_private(page) }; 584 swp_entry_t entry = { .val = page_private(page) };
623 /* 585
624 * Store the swap location in the pte. 586 if (PageSwapCache(page)) {
625 * See handle_pte_fault() ... 587 /*
626 */ 588 * Store the swap location in the pte.
627 BUG_ON(!PageSwapCache(page)); 589 * See handle_pte_fault() ...
628 swap_duplicate(entry); 590 */
629 if (list_empty(&mm->mmlist)) { 591 swap_duplicate(entry);
630 spin_lock(&mmlist_lock); 592 if (list_empty(&mm->mmlist)) {
631 if (list_empty(&mm->mmlist)) 593 spin_lock(&mmlist_lock);
632 list_add(&mm->mmlist, &init_mm.mmlist); 594 if (list_empty(&mm->mmlist))
633 spin_unlock(&mmlist_lock); 595 list_add(&mm->mmlist, &init_mm.mmlist);
596 spin_unlock(&mmlist_lock);
597 }
598 dec_mm_counter(mm, anon_rss);
599#ifdef CONFIG_MIGRATION
600 } else {
601 /*
602 * Store the pfn of the page in a special migration
603 * pte. do_swap_page() will wait until the migration
604 * pte is removed and then restart fault handling.
605 */
606 BUG_ON(!migration);
607 entry = make_migration_entry(page, pte_write(pteval));
608#endif
634 } 609 }
635 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 610 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
636 BUG_ON(pte_file(*pte)); 611 BUG_ON(pte_file(*pte));
637 dec_mm_counter(mm, anon_rss);
638 } else 612 } else
613#ifdef CONFIG_MIGRATION
614 if (migration) {
615 /* Establish migration entry for a file page */
616 swp_entry_t entry;
617 entry = make_migration_entry(page, pte_write(pteval));
618 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
619 } else
620#endif
639 dec_mm_counter(mm, file_rss); 621 dec_mm_counter(mm, file_rss);
640 622
623
641 page_remove_rmap(page); 624 page_remove_rmap(page);
642 page_cache_release(page); 625 page_cache_release(page);
643 626
@@ -736,7 +719,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
736 pte_unmap_unlock(pte - 1, ptl); 719 pte_unmap_unlock(pte - 1, ptl);
737} 720}
738 721
739static int try_to_unmap_anon(struct page *page, int ignore_refs) 722static int try_to_unmap_anon(struct page *page, int migration)
740{ 723{
741 struct anon_vma *anon_vma; 724 struct anon_vma *anon_vma;
742 struct vm_area_struct *vma; 725 struct vm_area_struct *vma;
@@ -747,7 +730,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs)
747 return ret; 730 return ret;
748 731
749 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 732 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
750 ret = try_to_unmap_one(page, vma, ignore_refs); 733 ret = try_to_unmap_one(page, vma, migration);
751 if (ret == SWAP_FAIL || !page_mapped(page)) 734 if (ret == SWAP_FAIL || !page_mapped(page))
752 break; 735 break;
753 } 736 }
@@ -764,7 +747,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs)
764 * 747 *
765 * This function is only called from try_to_unmap for object-based pages. 748 * This function is only called from try_to_unmap for object-based pages.
766 */ 749 */
767static int try_to_unmap_file(struct page *page, int ignore_refs) 750static int try_to_unmap_file(struct page *page, int migration)
768{ 751{
769 struct address_space *mapping = page->mapping; 752 struct address_space *mapping = page->mapping;
770 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 753 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -778,7 +761,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs)
778 761
779 spin_lock(&mapping->i_mmap_lock); 762 spin_lock(&mapping->i_mmap_lock);
780 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 763 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
781 ret = try_to_unmap_one(page, vma, ignore_refs); 764 ret = try_to_unmap_one(page, vma, migration);
782 if (ret == SWAP_FAIL || !page_mapped(page)) 765 if (ret == SWAP_FAIL || !page_mapped(page))
783 goto out; 766 goto out;
784 } 767 }
@@ -863,16 +846,16 @@ out:
863 * SWAP_AGAIN - we missed a mapping, try again later 846 * SWAP_AGAIN - we missed a mapping, try again later
864 * SWAP_FAIL - the page is unswappable 847 * SWAP_FAIL - the page is unswappable
865 */ 848 */
866int try_to_unmap(struct page *page, int ignore_refs) 849int try_to_unmap(struct page *page, int migration)
867{ 850{
868 int ret; 851 int ret;
869 852
870 BUG_ON(!PageLocked(page)); 853 BUG_ON(!PageLocked(page));
871 854
872 if (PageAnon(page)) 855 if (PageAnon(page))
873 ret = try_to_unmap_anon(page, ignore_refs); 856 ret = try_to_unmap_anon(page, migration);
874 else 857 else
875 ret = try_to_unmap_file(page, ignore_refs); 858 ret = try_to_unmap_file(page, migration);
876 859
877 if (!page_mapped(page)) 860 if (!page_mapped(page))
878 ret = SWAP_SUCCESS; 861 ret = SWAP_SUCCESS;
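
Taken together, the rmap.c hunks rename try_to_unmap()'s flag from 'ignore_refs' to 'migration' and, when it is set, replace the cleared pte with a special migration entry (built by make_migration_entry()) for both anonymous and file-backed pages; a thread faulting on such a pte waits in do_swap_page() until the entry is removed. A minimal sketch of the calling convention from a migration-style caller -- unmap_for_migration() is a hypothetical helper, the real users live in mm/migrate.c:

#include <linux/mm.h>
#include <linux/rmap.h>

static int unmap_for_migration(struct page *page)
{
	int ret;

	BUG_ON(!PageLocked(page));	/* try_to_unmap() requires the page lock */

	/* second argument == 1: migration mode, leave migration ptes behind */
	ret = try_to_unmap(page, 1);
	if (ret != SWAP_SUCCESS)
		return -EAGAIN;		/* some mapping could not be torn down */

	/* the caller would now copy the page and remove the migration ptes */
	return 0;
}
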
diff --git a/mm/shmem.c b/mm/shmem.c
index 797eef3805ce..38bc3334f263 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1081,14 +1081,6 @@ repeat:
1081 page_cache_release(swappage); 1081 page_cache_release(swappage);
1082 goto repeat; 1082 goto repeat;
1083 } 1083 }
1084 if (!PageSwapCache(swappage)) {
1085 /* Page migration has occured */
1086 shmem_swp_unmap(entry);
1087 spin_unlock(&info->lock);
1088 unlock_page(swappage);
1089 page_cache_release(swappage);
1090 goto repeat;
1091 }
1092 if (PageWriteback(swappage)) { 1084 if (PageWriteback(swappage)) {
1093 shmem_swp_unmap(entry); 1085 shmem_swp_unmap(entry);
1094 spin_unlock(&info->lock); 1086 spin_unlock(&info->lock);
@@ -1654,9 +1646,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
1654 return desc.error; 1646 return desc.error;
1655} 1647}
1656 1648
1657static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) 1649static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1658{ 1650{
1659 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1651 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1660 1652
1661 buf->f_type = TMPFS_MAGIC; 1653 buf->f_type = TMPFS_MAGIC;
1662 buf->f_bsize = PAGE_CACHE_SIZE; 1654 buf->f_bsize = PAGE_CACHE_SIZE;
@@ -2233,10 +2225,10 @@ static struct vm_operations_struct shmem_vm_ops = {
2233}; 2225};
2234 2226
2235 2227
2236static struct super_block *shmem_get_sb(struct file_system_type *fs_type, 2228static int shmem_get_sb(struct file_system_type *fs_type,
2237 int flags, const char *dev_name, void *data) 2229 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2238{ 2230{
2239 return get_sb_nodev(fs_type, flags, data, shmem_fill_super); 2231 return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
2240} 2232}
2241 2233
2242static struct file_system_type tmpfs_fs_type = { 2234static struct file_system_type tmpfs_fs_type = {
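
The shmem.c changes track two VFS interface updates merged alongside this: ->statfs() now takes a struct dentry rather than a super_block, and ->get_sb() returns an int and hands the result back through a struct vfsmount supplied by the caller. Any simple in-memory filesystem follows the same shape; a sketch for a hypothetical "foofs" (all names, the magic number, and foofs_fill_super() are placeholders, not real kernel symbols):

static int foofs_fill_super(struct super_block *sb, void *data, int silent);

static int foofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;	/* sb now reached via the dentry */

	buf->f_type = 0x464f4f;			/* made-up magic for the example */
	buf->f_bsize = sb->s_blocksize;
	return 0;
}

static int foofs_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
	/* returns 0 or -errno; the root dentry is attached to @mnt */
	return get_sb_nodev(fs_type, flags, data, foofs_fill_super, mnt);
}
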
diff --git a/mm/slab.c b/mm/slab.c
index f1b644eb39d8..98ac20bc0de9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -331,6 +331,8 @@ static __always_inline int index_of(const size_t size)
331 return 0; 331 return 0;
332} 332}
333 333
334static int slab_early_init = 1;
335
334#define INDEX_AC index_of(sizeof(struct arraycache_init)) 336#define INDEX_AC index_of(sizeof(struct arraycache_init))
335#define INDEX_L3 index_of(sizeof(struct kmem_list3)) 337#define INDEX_L3 index_of(sizeof(struct kmem_list3))
336 338
@@ -592,6 +594,7 @@ static inline struct kmem_cache *page_get_cache(struct page *page)
592{ 594{
593 if (unlikely(PageCompound(page))) 595 if (unlikely(PageCompound(page)))
594 page = (struct page *)page_private(page); 596 page = (struct page *)page_private(page);
597 BUG_ON(!PageSlab(page));
595 return (struct kmem_cache *)page->lru.next; 598 return (struct kmem_cache *)page->lru.next;
596} 599}
597 600
@@ -604,6 +607,7 @@ static inline struct slab *page_get_slab(struct page *page)
604{ 607{
605 if (unlikely(PageCompound(page))) 608 if (unlikely(PageCompound(page)))
606 page = (struct page *)page_private(page); 609 page = (struct page *)page_private(page);
610 BUG_ON(!PageSlab(page));
607 return (struct slab *)page->lru.prev; 611 return (struct slab *)page->lru.prev;
608} 612}
609 613
@@ -1024,6 +1028,40 @@ static void drain_alien_cache(struct kmem_cache *cachep,
1024 } 1028 }
1025 } 1029 }
1026} 1030}
1031
1032static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1033{
1034 struct slab *slabp = virt_to_slab(objp);
1035 int nodeid = slabp->nodeid;
1036 struct kmem_list3 *l3;
1037 struct array_cache *alien = NULL;
1038
1039 /*
1040 * Make sure we are not freeing an object from another node to the array
1041 * cache on this cpu.
1042 */
1043 if (likely(slabp->nodeid == numa_node_id()))
1044 return 0;
1045
1046 l3 = cachep->nodelists[numa_node_id()];
1047 STATS_INC_NODEFREES(cachep);
1048 if (l3->alien && l3->alien[nodeid]) {
1049 alien = l3->alien[nodeid];
1050 spin_lock(&alien->lock);
1051 if (unlikely(alien->avail == alien->limit)) {
1052 STATS_INC_ACOVERFLOW(cachep);
1053 __drain_alien_cache(cachep, alien, nodeid);
1054 }
1055 alien->entry[alien->avail++] = objp;
1056 spin_unlock(&alien->lock);
1057 } else {
1058 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1059 free_block(cachep, &objp, 1, nodeid);
1060 spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1061 }
1062 return 1;
1063}
1064
1027#else 1065#else
1028 1066
1029#define drain_alien_cache(cachep, alien) do { } while (0) 1067#define drain_alien_cache(cachep, alien) do { } while (0)
@@ -1038,6 +1076,11 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
1038{ 1076{
1039} 1077}
1040 1078
1079static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1080{
1081 return 0;
1082}
1083
1041#endif 1084#endif
1042 1085
1043static int cpuup_callback(struct notifier_block *nfb, 1086static int cpuup_callback(struct notifier_block *nfb,
@@ -1335,6 +1378,8 @@ void __init kmem_cache_init(void)
1335 NULL, NULL); 1378 NULL, NULL);
1336 } 1379 }
1337 1380
1381 slab_early_init = 0;
1382
1338 while (sizes->cs_size != ULONG_MAX) { 1383 while (sizes->cs_size != ULONG_MAX) {
1339 /* 1384 /*
1340 * For performance, all the general caches are L1 aligned. 1385 * For performance, all the general caches are L1 aligned.
@@ -1450,31 +1495,29 @@ __initcall(cpucache_init);
1450static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) 1495static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1451{ 1496{
1452 struct page *page; 1497 struct page *page;
1453 void *addr; 1498 int nr_pages;
1454 int i; 1499 int i;
1455 1500
1456 flags |= cachep->gfpflags;
1457#ifndef CONFIG_MMU 1501#ifndef CONFIG_MMU
1458 /* nommu uses slab's for process anonymous memory allocations, so 1502 /*
1459 * requires __GFP_COMP to properly refcount higher order allocations"	1503 * Nommu uses slabs for process anonymous memory allocations, and thus
1504 * requires __GFP_COMP to properly refcount higher order allocations
1460 */ 1505 */
1461 page = alloc_pages_node(nodeid, (flags | __GFP_COMP), cachep->gfporder); 1506 flags |= __GFP_COMP;
1462#else
1463 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1464#endif 1507#endif
1508 flags |= cachep->gfpflags;
1509
1510 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1465 if (!page) 1511 if (!page)
1466 return NULL; 1512 return NULL;
1467 addr = page_address(page);
1468 1513
1469 i = (1 << cachep->gfporder); 1514 nr_pages = (1 << cachep->gfporder);
1470 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1515 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1471 atomic_add(i, &slab_reclaim_pages); 1516 atomic_add(nr_pages, &slab_reclaim_pages);
1472 add_page_state(nr_slab, i); 1517 add_page_state(nr_slab, nr_pages);
1473 while (i--) { 1518 for (i = 0; i < nr_pages; i++)
1474 __SetPageSlab(page); 1519 __SetPageSlab(page + i);
1475 page++; 1520 return page_address(page);
1476 }
1477 return addr;
1478} 1521}
1479 1522
1480/* 1523/*
@@ -1913,8 +1956,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1913 void (*dtor)(void*, struct kmem_cache *, unsigned long)) 1956 void (*dtor)(void*, struct kmem_cache *, unsigned long))
1914{ 1957{
1915 size_t left_over, slab_size, ralign; 1958 size_t left_over, slab_size, ralign;
1916 struct kmem_cache *cachep = NULL; 1959 struct kmem_cache *cachep = NULL, *pc;
1917 struct list_head *p;
1918 1960
1919 /* 1961 /*
1920 * Sanity checks... these are all serious usage bugs. 1962 * Sanity checks... these are all serious usage bugs.
@@ -1934,8 +1976,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1934 1976
1935 mutex_lock(&cache_chain_mutex); 1977 mutex_lock(&cache_chain_mutex);
1936 1978
1937 list_for_each(p, &cache_chain) { 1979 list_for_each_entry(pc, &cache_chain, next) {
1938 struct kmem_cache *pc = list_entry(p, struct kmem_cache, next);
1939 mm_segment_t old_fs = get_fs(); 1980 mm_segment_t old_fs = get_fs();
1940 char tmp; 1981 char tmp;
1941 int res; 1982 int res;
@@ -2069,8 +2110,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2069#endif 2110#endif
2070#endif 2111#endif
2071 2112
2072 /* Determine if the slab management is 'on' or 'off' slab. */ 2113 /*
2073 if (size >= (PAGE_SIZE >> 3)) 2114 * Determine if the slab management is 'on' or 'off' slab.
2115 * (bootstrapping cannot cope with offslab caches so don't do
2116 * it too early on.)
2117 */
2118 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2074 /* 2119 /*
2075 * Size is large, assume best to place the slab management obj 2120 * Size is large, assume best to place the slab management obj
2076 * off-slab (should allow better packing of objs). 2121 * off-slab (should allow better packing of objs).
@@ -2460,23 +2505,28 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2460 slabp->inuse--; 2505 slabp->inuse--;
2461} 2506}
2462 2507
2463static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, 2508/*
2464 void *objp) 2509 * Map pages beginning at addr to the given cache and slab. This is required
2510 * for the slab allocator to be able to lookup the cache and slab of a
2511 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2512 */
2513static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2514 void *addr)
2465{ 2515{
2466 int i; 2516 int nr_pages;
2467 struct page *page; 2517 struct page *page;
2468 2518
2469 /* Nasty!!!!!! I hope this is OK. */ 2519 page = virt_to_page(addr);
2470 page = virt_to_page(objp);
2471 2520
2472 i = 1; 2521 nr_pages = 1;
2473 if (likely(!PageCompound(page))) 2522 if (likely(!PageCompound(page)))
2474 i <<= cachep->gfporder; 2523 nr_pages <<= cache->gfporder;
2524
2475 do { 2525 do {
2476 page_set_cache(page, cachep); 2526 page_set_cache(page, cache);
2477 page_set_slab(page, slabp); 2527 page_set_slab(page, slab);
2478 page++; 2528 page++;
2479 } while (--i); 2529 } while (--nr_pages);
2480} 2530}
2481 2531
2482/* 2532/*
@@ -2548,7 +2598,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2548 goto opps1; 2598 goto opps1;
2549 2599
2550 slabp->nodeid = nodeid; 2600 slabp->nodeid = nodeid;
2551 set_slab_attr(cachep, slabp, objp); 2601 slab_map_pages(cachep, slabp, objp);
2552 2602
2553 cache_init_objs(cachep, slabp, ctor_flags); 2603 cache_init_objs(cachep, slabp, ctor_flags);
2554 2604
@@ -2596,6 +2646,28 @@ static void kfree_debugcheck(const void *objp)
2596 } 2646 }
2597} 2647}
2598 2648
2649static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2650{
2651 unsigned long redzone1, redzone2;
2652
2653 redzone1 = *dbg_redzone1(cache, obj);
2654 redzone2 = *dbg_redzone2(cache, obj);
2655
2656 /*
2657 * Redzone is ok.
2658 */
2659 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2660 return;
2661
2662 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2663 slab_error(cache, "double free detected");
2664 else
2665 slab_error(cache, "memory outside object was overwritten");
2666
2667 printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
2668 obj, redzone1, redzone2);
2669}
2670
2599static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2671static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2600 void *caller) 2672 void *caller)
2601{ 2673{
@@ -2607,27 +2679,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2607 kfree_debugcheck(objp); 2679 kfree_debugcheck(objp);
2608 page = virt_to_page(objp); 2680 page = virt_to_page(objp);
2609 2681
2610 if (page_get_cache(page) != cachep) {
2611 printk(KERN_ERR "mismatch in kmem_cache_free: expected "
2612 "cache %p, got %p\n",
2613 page_get_cache(page), cachep);
2614 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2615 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2616 page_get_cache(page)->name);
2617 WARN_ON(1);
2618 }
2619 slabp = page_get_slab(page); 2682 slabp = page_get_slab(page);
2620 2683
2621 if (cachep->flags & SLAB_RED_ZONE) { 2684 if (cachep->flags & SLAB_RED_ZONE) {
2622 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || 2685 verify_redzone_free(cachep, objp);
2623 *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2624 slab_error(cachep, "double free, or memory outside"
2625 " object was overwritten");
2626 printk(KERN_ERR "%p: redzone 1:0x%lx, "
2627 "redzone 2:0x%lx.\n",
2628 objp, *dbg_redzone1(cachep, objp),
2629 *dbg_redzone2(cachep, objp));
2630 }
2631 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2686 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2632 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2687 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2633 } 2688 }
@@ -3087,41 +3142,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3087 check_irq_off(); 3142 check_irq_off();
3088 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3143 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3089 3144
3090 /* Make sure we are not freeing a object from another 3145 if (cache_free_alien(cachep, objp))
3091 * node to the array cache on this cpu. 3146 return;
3092 */ 3147
3093#ifdef CONFIG_NUMA
3094 {
3095 struct slab *slabp;
3096 slabp = virt_to_slab(objp);
3097 if (unlikely(slabp->nodeid != numa_node_id())) {
3098 struct array_cache *alien = NULL;
3099 int nodeid = slabp->nodeid;
3100 struct kmem_list3 *l3;
3101
3102 l3 = cachep->nodelists[numa_node_id()];
3103 STATS_INC_NODEFREES(cachep);
3104 if (l3->alien && l3->alien[nodeid]) {
3105 alien = l3->alien[nodeid];
3106 spin_lock(&alien->lock);
3107 if (unlikely(alien->avail == alien->limit)) {
3108 STATS_INC_ACOVERFLOW(cachep);
3109 __drain_alien_cache(cachep,
3110 alien, nodeid);
3111 }
3112 alien->entry[alien->avail++] = objp;
3113 spin_unlock(&alien->lock);
3114 } else {
3115 spin_lock(&(cachep->nodelists[nodeid])->
3116 list_lock);
3117 free_block(cachep, &objp, 1, nodeid);
3118 spin_unlock(&(cachep->nodelists[nodeid])->
3119 list_lock);
3120 }
3121 return;
3122 }
3123 }
3124#endif
3125 if (likely(ac->avail < ac->limit)) { 3148 if (likely(ac->avail < ac->limit)) {
3126 STATS_INC_FREEHIT(cachep); 3149 STATS_INC_FREEHIT(cachep);
3127 ac->entry[ac->avail++] = objp; 3150 ac->entry[ac->avail++] = objp;
@@ -3254,26 +3277,10 @@ EXPORT_SYMBOL(kmalloc_node);
3254#endif 3277#endif
3255 3278
3256/** 3279/**
3257 * kmalloc - allocate memory 3280 * __do_kmalloc - allocate memory
3258 * @size: how many bytes of memory are required. 3281 * @size: how many bytes of memory are required.
3259 * @flags: the type of memory to allocate. 3282 * @flags: the type of memory to allocate (see kmalloc).
3260 * @caller: function caller for debug tracking of the caller 3283 * @caller: function caller for debug tracking of the caller
3261 *
3262 * kmalloc is the normal method of allocating memory
3263 * in the kernel.
3264 *
3265 * The @flags argument may be one of:
3266 *
3267 * %GFP_USER - Allocate memory on behalf of user. May sleep.
3268 *
3269 * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
3270 *
3271 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers.
3272 *
3273 * Additionally, the %GFP_DMA flag may be set to indicate the memory
3274 * must be suitable for DMA. This can mean different things on different
3275 * platforms. For example, on i386, it means that the memory must come
3276 * from the first 16MB.
3277 */ 3284 */
3278static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3285static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3279 void *caller) 3286 void *caller)
@@ -3371,6 +3378,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3371{ 3378{
3372 unsigned long flags; 3379 unsigned long flags;
3373 3380
3381 BUG_ON(virt_to_cache(objp) != cachep);
3382
3374 local_irq_save(flags); 3383 local_irq_save(flags);
3375 __cache_free(cachep, objp); 3384 __cache_free(cachep, objp);
3376 local_irq_restore(flags); 3385 local_irq_restore(flags);
@@ -3680,7 +3689,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3680 */ 3689 */
3681static void cache_reap(void *unused) 3690static void cache_reap(void *unused)
3682{ 3691{
3683 struct list_head *walk; 3692 struct kmem_cache *searchp;
3684 struct kmem_list3 *l3; 3693 struct kmem_list3 *l3;
3685 int node = numa_node_id(); 3694 int node = numa_node_id();
3686 3695
@@ -3691,13 +3700,11 @@ static void cache_reap(void *unused)
3691 return; 3700 return;
3692 } 3701 }
3693 3702
3694 list_for_each(walk, &cache_chain) { 3703 list_for_each_entry(searchp, &cache_chain, next) {
3695 struct kmem_cache *searchp;
3696 struct list_head *p; 3704 struct list_head *p;
3697 int tofree; 3705 int tofree;
3698 struct slab *slabp; 3706 struct slab *slabp;
3699 3707
3700 searchp = list_entry(walk, struct kmem_cache, next);
3701 check_irq_on(); 3708 check_irq_on();
3702 3709
3703 /* 3710 /*
@@ -3825,7 +3832,6 @@ static void s_stop(struct seq_file *m, void *p)
3825static int s_show(struct seq_file *m, void *p) 3832static int s_show(struct seq_file *m, void *p)
3826{ 3833{
3827 struct kmem_cache *cachep = p; 3834 struct kmem_cache *cachep = p;
3828 struct list_head *q;
3829 struct slab *slabp; 3835 struct slab *slabp;
3830 unsigned long active_objs; 3836 unsigned long active_objs;
3831 unsigned long num_objs; 3837 unsigned long num_objs;
@@ -3846,15 +3852,13 @@ static int s_show(struct seq_file *m, void *p)
3846 check_irq_on(); 3852 check_irq_on();
3847 spin_lock_irq(&l3->list_lock); 3853 spin_lock_irq(&l3->list_lock);
3848 3854
3849 list_for_each(q, &l3->slabs_full) { 3855 list_for_each_entry(slabp, &l3->slabs_full, list) {
3850 slabp = list_entry(q, struct slab, list);
3851 if (slabp->inuse != cachep->num && !error) 3856 if (slabp->inuse != cachep->num && !error)
3852 error = "slabs_full accounting error"; 3857 error = "slabs_full accounting error";
3853 active_objs += cachep->num; 3858 active_objs += cachep->num;
3854 active_slabs++; 3859 active_slabs++;
3855 } 3860 }
3856 list_for_each(q, &l3->slabs_partial) { 3861 list_for_each_entry(slabp, &l3->slabs_partial, list) {
3857 slabp = list_entry(q, struct slab, list);
3858 if (slabp->inuse == cachep->num && !error) 3862 if (slabp->inuse == cachep->num && !error)
3859 error = "slabs_partial inuse accounting error"; 3863 error = "slabs_partial inuse accounting error";
3860 if (!slabp->inuse && !error) 3864 if (!slabp->inuse && !error)
@@ -3862,8 +3866,7 @@ static int s_show(struct seq_file *m, void *p)
3862 active_objs += slabp->inuse; 3866 active_objs += slabp->inuse;
3863 active_slabs++; 3867 active_slabs++;
3864 } 3868 }
3865 list_for_each(q, &l3->slabs_free) { 3869 list_for_each_entry(slabp, &l3->slabs_free, list) {
3866 slabp = list_entry(q, struct slab, list);
3867 if (slabp->inuse && !error) 3870 if (slabp->inuse && !error)
3868 error = "slabs_free/inuse accounting error"; 3871 error = "slabs_free/inuse accounting error";
3869 num_slabs++; 3872 num_slabs++;
@@ -3956,7 +3959,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3956{ 3959{
3957 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 3960 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3958 int limit, batchcount, shared, res; 3961 int limit, batchcount, shared, res;
3959 struct list_head *p; 3962 struct kmem_cache *cachep;
3960 3963
3961 if (count > MAX_SLABINFO_WRITE) 3964 if (count > MAX_SLABINFO_WRITE)
3962 return -EINVAL; 3965 return -EINVAL;
@@ -3975,10 +3978,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3975 /* Find the cache in the chain of caches. */ 3978 /* Find the cache in the chain of caches. */
3976 mutex_lock(&cache_chain_mutex); 3979 mutex_lock(&cache_chain_mutex);
3977 res = -EINVAL; 3980 res = -EINVAL;
3978 list_for_each(p, &cache_chain) { 3981 list_for_each_entry(cachep, &cache_chain, next) {
3979 struct kmem_cache *cachep;
3980
3981 cachep = list_entry(p, struct kmem_cache, next);
3982 if (!strcmp(cachep->name, kbuf)) { 3982 if (!strcmp(cachep->name, kbuf)) {
3983 if (limit < 1 || batchcount < 1 || 3983 if (limit < 1 || batchcount < 1 ||
3984 batchcount > limit || shared < 0) { 3984 batchcount > limit || shared < 0) {
@@ -4080,7 +4080,6 @@ static void show_symbol(struct seq_file *m, unsigned long address)
4080static int leaks_show(struct seq_file *m, void *p) 4080static int leaks_show(struct seq_file *m, void *p)
4081{ 4081{
4082 struct kmem_cache *cachep = p; 4082 struct kmem_cache *cachep = p;
4083 struct list_head *q;
4084 struct slab *slabp; 4083 struct slab *slabp;
4085 struct kmem_list3 *l3; 4084 struct kmem_list3 *l3;
4086 const char *name; 4085 const char *name;
@@ -4105,14 +4104,10 @@ static int leaks_show(struct seq_file *m, void *p)
4105 check_irq_on(); 4104 check_irq_on();
4106 spin_lock_irq(&l3->list_lock); 4105 spin_lock_irq(&l3->list_lock);
4107 4106
4108 list_for_each(q, &l3->slabs_full) { 4107 list_for_each_entry(slabp, &l3->slabs_full, list)
4109 slabp = list_entry(q, struct slab, list);
4110 handle_slab(n, cachep, slabp); 4108 handle_slab(n, cachep, slabp);
4111 } 4109 list_for_each_entry(slabp, &l3->slabs_partial, list)
4112 list_for_each(q, &l3->slabs_partial) {
4113 slabp = list_entry(q, struct slab, list);
4114 handle_slab(n, cachep, slabp); 4110 handle_slab(n, cachep, slabp);
4115 }
4116 spin_unlock_irq(&l3->list_lock); 4111 spin_unlock_irq(&l3->list_lock);
4117 } 4112 }
4118 name = cachep->name; 4113 name = cachep->name;
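
Much of the slab.c churn in the later hunks is a mechanical conversion from list_for_each() plus list_entry() to list_for_each_entry(), dropping the temporary struct list_head pointer and the explicit container lookup. The idiom in isolation, using a made-up item type rather than struct kmem_cache or struct slab:

#include <linux/list.h>

struct item {
	int value;
	struct list_head list;		/* linkage within some list */
};

static int sum_items(struct list_head *head)
{
	struct item *it;
	int sum = 0;

	/* iterate over the containing structures directly */
	list_for_each_entry(it, head, list)
		sum += it->value;

	return sum;
}
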
diff --git a/mm/sparse.c b/mm/sparse.c
index 100040c0dfb6..e0a3fe48aa37 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -99,6 +99,22 @@ int __section_nr(struct mem_section* ms)
99 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 99 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
100} 100}
101 101
102/*
103 * During early boot, before section_mem_map is used for an actual
104 * mem_map, we use section_mem_map to store the section's NUMA
105 * node. This keeps us from having to use another data structure. The
106 * node information is cleared just before we store the real mem_map.
107 */
108static inline unsigned long sparse_encode_early_nid(int nid)
109{
110 return (nid << SECTION_NID_SHIFT);
111}
112
113static inline int sparse_early_nid(struct mem_section *section)
114{
115 return (section->section_mem_map >> SECTION_NID_SHIFT);
116}
117
102/* Record a memory area against a node. */ 118/* Record a memory area against a node. */
103void memory_present(int nid, unsigned long start, unsigned long end) 119void memory_present(int nid, unsigned long start, unsigned long end)
104{ 120{
@@ -113,7 +129,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
113 129
114 ms = __nr_to_section(section); 130 ms = __nr_to_section(section);
115 if (!ms->section_mem_map) 131 if (!ms->section_mem_map)
116 ms->section_mem_map = SECTION_MARKED_PRESENT; 132 ms->section_mem_map = sparse_encode_early_nid(nid) |
133 SECTION_MARKED_PRESENT;
117 } 134 }
118} 135}
119 136
@@ -164,6 +181,7 @@ static int sparse_init_one_section(struct mem_section *ms,
164 if (!valid_section(ms)) 181 if (!valid_section(ms))
165 return -EINVAL; 182 return -EINVAL;
166 183
184 ms->section_mem_map &= ~SECTION_MAP_MASK;
167 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); 185 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
168 186
169 return 1; 187 return 1;
@@ -172,8 +190,8 @@ static int sparse_init_one_section(struct mem_section *ms,
172static struct page *sparse_early_mem_map_alloc(unsigned long pnum) 190static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
173{ 191{
174 struct page *map; 192 struct page *map;
175 int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
176 struct mem_section *ms = __nr_to_section(pnum); 193 struct mem_section *ms = __nr_to_section(pnum);
194 int nid = sparse_early_nid(ms);
177 195
178 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); 196 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
179 if (map) 197 if (map)
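
The sparse.c change parks the section's NUMA node in the otherwise-unused section_mem_map word during early boot, so sparse_early_mem_map_alloc() no longer needs early_pfn_to_nid(). A compact round-trip of the encoding, written out as a sketch (the values of SECTION_NID_SHIFT and SECTION_MAP_MASK come from mmzone.h and are not shown here):

/*
 * memory_present():           section_mem_map  = (nid << SECTION_NID_SHIFT)
 *                                                | SECTION_MARKED_PRESENT;
 * sparse_early_nid():         nid = section_mem_map >> SECTION_NID_SHIFT;
 * sparse_init_one_section():  section_mem_map &= ~SECTION_MAP_MASK;
 *                             (discards the stored nid before the encoded
 *                              mem_map pointer is or'ed in)
 */
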
diff --git a/mm/swap.c b/mm/swap.c
index 88895c249bc9..03ae2076f92f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -480,48 +480,6 @@ static int cpu_swap_callback(struct notifier_block *nfb,
480#endif /* CONFIG_HOTPLUG_CPU */ 480#endif /* CONFIG_HOTPLUG_CPU */
481#endif /* CONFIG_SMP */ 481#endif /* CONFIG_SMP */
482 482
483#ifdef CONFIG_SMP
484void percpu_counter_mod(struct percpu_counter *fbc, long amount)
485{
486 long count;
487 long *pcount;
488 int cpu = get_cpu();
489
490 pcount = per_cpu_ptr(fbc->counters, cpu);
491 count = *pcount + amount;
492 if (count >= FBC_BATCH || count <= -FBC_BATCH) {
493 spin_lock(&fbc->lock);
494 fbc->count += count;
495 *pcount = 0;
496 spin_unlock(&fbc->lock);
497 } else {
498 *pcount = count;
499 }
500 put_cpu();
501}
502EXPORT_SYMBOL(percpu_counter_mod);
503
504/*
505 * Add up all the per-cpu counts, return the result. This is a more accurate
506 * but much slower version of percpu_counter_read_positive()
507 */
508long percpu_counter_sum(struct percpu_counter *fbc)
509{
510 long ret;
511 int cpu;
512
513 spin_lock(&fbc->lock);
514 ret = fbc->count;
515 for_each_possible_cpu(cpu) {
516 long *pcount = per_cpu_ptr(fbc->counters, cpu);
517 ret += *pcount;
518 }
519 spin_unlock(&fbc->lock);
520 return ret < 0 ? 0 : ret;
521}
522EXPORT_SYMBOL(percpu_counter_sum);
523#endif
524
525/* 483/*
526 * Perform any setup for the swap system 484 * Perform any setup for the swap system
527 */ 485 */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e5fd5385f0cc..cc367f7e75d8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t entry)
395 struct swap_info_struct * p; 395 struct swap_info_struct * p;
396 struct page *page = NULL; 396 struct page *page = NULL;
397 397
398 if (is_migration_entry(entry))
399 return;
400
398 p = swap_info_get(entry); 401 p = swap_info_get(entry);
399 if (p) { 402 if (p) {
400 if (swap_entry_free(p, swp_offset(entry)) == 1) { 403 if (swap_entry_free(p, swp_offset(entry)) == 1) {
@@ -615,15 +618,6 @@ static int unuse_mm(struct mm_struct *mm,
615 return 0; 618 return 0;
616} 619}
617 620
618#ifdef CONFIG_MIGRATION
619int remove_vma_swap(struct vm_area_struct *vma, struct page *page)
620{
621 swp_entry_t entry = { .val = page_private(page) };
622
623 return unuse_vma(vma, entry, page);
624}
625#endif
626
627/* 621/*
628 * Scan swap_map from current position to next entry still in use. 622 * Scan swap_map from current position to next entry still in use.
629 * Recycle to start on reaching the end, returning 0 when empty. 623 * Recycle to start on reaching the end, returning 0 when empty.
@@ -716,7 +710,6 @@ static int try_to_unuse(unsigned int type)
716 */ 710 */
717 swap_map = &si->swap_map[i]; 711 swap_map = &si->swap_map[i];
718 entry = swp_entry(type, i); 712 entry = swp_entry(type, i);
719again:
720 page = read_swap_cache_async(entry, NULL, 0); 713 page = read_swap_cache_async(entry, NULL, 0);
721 if (!page) { 714 if (!page) {
722 /* 715 /*
@@ -751,12 +744,6 @@ again:
751 wait_on_page_locked(page); 744 wait_on_page_locked(page);
752 wait_on_page_writeback(page); 745 wait_on_page_writeback(page);
753 lock_page(page); 746 lock_page(page);
754 if (!PageSwapCache(page)) {
755 /* Page migration has occured */
756 unlock_page(page);
757 page_cache_release(page);
758 goto again;
759 }
760 wait_on_page_writeback(page); 747 wait_on_page_writeback(page);
761 748
762 /* 749 /*
@@ -785,10 +772,8 @@ again:
785 while (*swap_map > 1 && !retval && 772 while (*swap_map > 1 && !retval &&
786 (p = p->next) != &start_mm->mmlist) { 773 (p = p->next) != &start_mm->mmlist) {
787 mm = list_entry(p, struct mm_struct, mmlist); 774 mm = list_entry(p, struct mm_struct, mmlist);
788 if (atomic_inc_return(&mm->mm_users) == 1) { 775 if (!atomic_inc_not_zero(&mm->mm_users))
789 atomic_dec(&mm->mm_users);
790 continue; 776 continue;
791 }
792 spin_unlock(&mmlist_lock); 777 spin_unlock(&mmlist_lock);
793 mmput(prev_mm); 778 mmput(prev_mm);
794 prev_mm = mm; 779 prev_mm = mm;
@@ -1407,19 +1392,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1407 if (!(p->flags & SWP_USED)) 1392 if (!(p->flags & SWP_USED))
1408 break; 1393 break;
1409 error = -EPERM; 1394 error = -EPERM;
1410 /* 1395 if (type >= MAX_SWAPFILES) {
1411 * Test if adding another swap device is possible. There are
1412 * two limiting factors: 1) the number of bits for the swap
1413 * type swp_entry_t definition and 2) the number of bits for
1414 * the swap type in the swap ptes as defined by the different
1415 * architectures. To honor both limitations a swap entry
1416 * with swap offset 0 and swap type ~0UL is created, encoded
1417 * to a swap pte, decoded to a swp_entry_t again and finally
1418 * the swap type part is extracted. This will mask all bits
1419 * from the initial ~0UL that can't be encoded in either the
1420 * swp_entry_t or the architecture definition of a swap pte.
1421 */
1422 if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
1423 spin_unlock(&swap_lock); 1396 spin_unlock(&swap_lock);
1424 goto out; 1397 goto out;
1425 } 1398 }
@@ -1504,8 +1477,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1504 error = -EINVAL; 1477 error = -EINVAL;
1505 goto bad_swap; 1478 goto bad_swap;
1506 } 1479 }
1507 page = read_cache_page(mapping, 0, 1480 page = read_mapping_page(mapping, 0, swap_file);
1508 (filler_t *)mapping->a_ops->readpage, swap_file);
1509 if (IS_ERR(page)) { 1481 if (IS_ERR(page)) {
1510 error = PTR_ERR(page); 1482 error = PTR_ERR(page);
1511 goto bad_swap; 1483 goto bad_swap;
@@ -1709,6 +1681,9 @@ int swap_duplicate(swp_entry_t entry)
1709 unsigned long offset, type; 1681 unsigned long offset, type;
1710 int result = 0; 1682 int result = 0;
1711 1683
1684 if (is_migration_entry(entry))
1685 return 1;
1686
1712 type = swp_type(entry); 1687 type = swp_type(entry);
1713 if (type >= nr_swapfiles) 1688 if (type >= nr_swapfiles)
1714 goto bad_file; 1689 goto bad_file;
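
Besides skipping migration entries in free_swap_and_cache() and swap_duplicate(), the swapfile.c hunks replace the "increment, then undo if we resurrected a dying mm" dance on mm->mm_users with atomic_inc_not_zero(), which takes the reference only if the count is still non-zero. The pattern in isolation -- struct foo and its 'users' field stand in for any atomic_t reference count, and headers are elided:

struct foo {
	atomic_t users;
};

/* old style: speculative increment followed by a compensating decrement */
static struct foo *foo_get_old(struct foo *obj)
{
	if (atomic_inc_return(&obj->users) == 1) {
		atomic_dec(&obj->users);	/* raced with the final put */
		return NULL;
	}
	return obj;
}

/* new style: take a reference only if at least one is already held */
static struct foo *foo_get_new(struct foo *obj)
{
	if (!atomic_inc_not_zero(&obj->users))
		return NULL;
	return obj;
}
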
diff --git a/mm/truncate.c b/mm/truncate.c
index 6cb3fff25f67..cf1b015df4a7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -230,14 +230,24 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
230 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 230 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
231 for (i = 0; i < pagevec_count(&pvec); i++) { 231 for (i = 0; i < pagevec_count(&pvec); i++) {
232 struct page *page = pvec.pages[i]; 232 struct page *page = pvec.pages[i];
233 pgoff_t index;
234 int lock_failed;
233 235
234 if (TestSetPageLocked(page)) { 236 lock_failed = TestSetPageLocked(page);
235 next++; 237
236 continue; 238 /*
237 } 239 * We really shouldn't be looking at the ->index of an
238 if (page->index > next) 240 * unlocked page. But we're not allowed to lock these
239 next = page->index; 241 * pages. So we rely upon nobody altering the ->index
242 * of this (pinned-by-us) page.
243 */
244 index = page->index;
245 if (index > next)
246 next = index;
240 next++; 247 next++;
248 if (lock_failed)
249 continue;
250
241 if (PageDirty(page) || PageWriteback(page)) 251 if (PageDirty(page) || PageWriteback(page))
242 goto unlock; 252 goto unlock;
243 if (page_mapped(page)) 253 if (page_mapped(page))
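
The truncate.c change makes invalidate_mapping_pages() advance 'next' from a snapshot of page->index taken while the page is only pinned, before it knows whether the non-blocking trylock succeeded; previously a page whose lock could not be taken contributed nothing, which could leave the cursor behind. Condensed, the new per-page ordering is (control-flow sketch only, matching the hunk above):

	lock_failed = TestSetPageLocked(page);	/* non-blocking trylock */

	index = page->index;	/* pinned, so nobody should change ->index */
	if (index > next)
		next = index;
	next++;

	if (lock_failed)
		continue;	/* skip the work, but the cursor still advanced */
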
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c0504f1e34eb..35f8553f893a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -257,6 +257,19 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int
257} 257}
258 258
259/* Caller must hold vmlist_lock */ 259/* Caller must hold vmlist_lock */
260static struct vm_struct *__find_vm_area(void *addr)
261{
262 struct vm_struct *tmp;
263
264 for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
265 if (tmp->addr == addr)
266 break;
267 }
268
269 return tmp;
270}
271
272/* Caller must hold vmlist_lock */
260struct vm_struct *__remove_vm_area(void *addr) 273struct vm_struct *__remove_vm_area(void *addr)
261{ 274{
262 struct vm_struct **p, *tmp; 275 struct vm_struct **p, *tmp;
@@ -498,11 +511,33 @@ EXPORT_SYMBOL(__vmalloc);
498 */ 511 */
499void *vmalloc(unsigned long size) 512void *vmalloc(unsigned long size)
500{ 513{
501 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); 514 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
502} 515}
503EXPORT_SYMBOL(vmalloc); 516EXPORT_SYMBOL(vmalloc);
504 517
505/** 518/**
519 * vmalloc_user - allocate virtually contiguous memory which has
520 * been zeroed so it can be mapped to userspace without
521 * leaking data.
522 *
523 * @size: allocation size
524 */
525void *vmalloc_user(unsigned long size)
526{
527 struct vm_struct *area;
528 void *ret;
529
530 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
531 write_lock(&vmlist_lock);
532 area = __find_vm_area(ret);
533 area->flags |= VM_USERMAP;
534 write_unlock(&vmlist_lock);
535
536 return ret;
537}
538EXPORT_SYMBOL(vmalloc_user);
539
540/**
506 * vmalloc_node - allocate memory on a specific node 541 * vmalloc_node - allocate memory on a specific node
507 * 542 *
508 * @size: allocation size 543 * @size: allocation size
@@ -516,7 +551,7 @@ EXPORT_SYMBOL(vmalloc);
516 */ 551 */
517void *vmalloc_node(unsigned long size, int node) 552void *vmalloc_node(unsigned long size, int node)
518{ 553{
519 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); 554 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
520} 555}
521EXPORT_SYMBOL(vmalloc_node); 556EXPORT_SYMBOL(vmalloc_node);
522 557
@@ -556,6 +591,28 @@ void *vmalloc_32(unsigned long size)
556} 591}
557EXPORT_SYMBOL(vmalloc_32); 592EXPORT_SYMBOL(vmalloc_32);
558 593
594/**
595 * vmalloc_32_user - allocate virtually contiguous memory (32bit
596 * addressable) which is zeroed so it can be
597 * mapped to userspace without leaking data.
598 *
599 * @size: allocation size
600 */
601void *vmalloc_32_user(unsigned long size)
602{
603 struct vm_struct *area;
604 void *ret;
605
606 ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
607 write_lock(&vmlist_lock);
608 area = __find_vm_area(ret);
609 area->flags |= VM_USERMAP;
610 write_unlock(&vmlist_lock);
611
612 return ret;
613}
614EXPORT_SYMBOL(vmalloc_32_user);
615
559long vread(char *buf, char *addr, unsigned long count) 616long vread(char *buf, char *addr, unsigned long count)
560{ 617{
561 struct vm_struct *tmp; 618 struct vm_struct *tmp;
@@ -630,3 +687,64 @@ finished:
630 read_unlock(&vmlist_lock); 687 read_unlock(&vmlist_lock);
631 return buf - buf_start; 688 return buf - buf_start;
632} 689}
690
691/**
692 * remap_vmalloc_range - map vmalloc pages to userspace
693 *
694 * @vma: vma to cover (map full range of vma)
695 * @addr: vmalloc memory
696 * @pgoff: number of pages into addr before first page to map
697 * @returns: 0 for success, -Exxx on failure
698 *
699 * This function checks that addr is a valid vmalloc'ed area, and
700 * that it is big enough to cover the vma. Will return failure if
701 * that criteria isn't met.
702 * those criteria aren't met.
703 * Similar to remap_pfn_range (see mm/memory.c)
704 */
705int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
706 unsigned long pgoff)
707{
708 struct vm_struct *area;
709 unsigned long uaddr = vma->vm_start;
710 unsigned long usize = vma->vm_end - vma->vm_start;
711 int ret;
712
713 if ((PAGE_SIZE-1) & (unsigned long)addr)
714 return -EINVAL;
715
716 read_lock(&vmlist_lock);
717 area = __find_vm_area(addr);
718 if (!area)
719 goto out_einval_locked;
720
721 if (!(area->flags & VM_USERMAP))
722 goto out_einval_locked;
723
724 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
725 goto out_einval_locked;
726 read_unlock(&vmlist_lock);
727
728 addr += pgoff << PAGE_SHIFT;
729 do {
730 struct page *page = vmalloc_to_page(addr);
731 ret = vm_insert_page(vma, uaddr, page);
732 if (ret)
733 return ret;
734
735 uaddr += PAGE_SIZE;
736 addr += PAGE_SIZE;
737 usize -= PAGE_SIZE;
738 } while (usize > 0);
739
740 /* Prevent "things" like memory migration? VM_flags need a cleanup... */
741 vma->vm_flags |= VM_RESERVED;
742
743 return ret;
744
745out_einval_locked:
746 read_unlock(&vmlist_lock);
747 return -EINVAL;
748}
749EXPORT_SYMBOL(remap_vmalloc_range);
750
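
The new vmalloc_user()/vmalloc_32_user() helpers return zeroed memory whose vm_struct carries VM_USERMAP, and remap_vmalloc_range() refuses to map anything not flagged that way, so only deliberately exported buffers can reach userspace. A minimal driver-side sketch of the intended use -- example_mmap() and dev_buf are illustrative names, and allocation/teardown are elided:

#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/mm.h>

static void *dev_buf;	/* assumed: dev_buf = vmalloc_user(BUF_SIZE); */

/* .mmap handler of a hypothetical character device */
static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * Maps the whole vma onto dev_buf starting at page offset 0.
	 * Fails with -EINVAL unless dev_buf was obtained from
	 * vmalloc_user()/vmalloc_32_user() and is large enough.
	 */
	return remap_vmalloc_range(vma, dev_buf, 0);
}
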
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 440a733fe2e9..72babac71dea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -61,6 +61,8 @@ struct scan_control {
61 * In this context, it doesn't matter that we scan the 61 * In this context, it doesn't matter that we scan the
62 * whole list at once. */ 62 * whole list at once. */
63 int swap_cluster_max; 63 int swap_cluster_max;
64
65 int swappiness;
64}; 66};
65 67
66/* 68/*
@@ -108,7 +110,7 @@ struct shrinker {
108 * From 0 .. 100. Higher means more swappy. 110 * From 0 .. 100. Higher means more swappy.
109 */ 111 */
110int vm_swappiness = 60; 112int vm_swappiness = 60;
111static long total_memory; 113long vm_total_pages; /* The total number of pages which the VM controls */
112 114
113static LIST_HEAD(shrinker_list); 115static LIST_HEAD(shrinker_list);
114static DECLARE_RWSEM(shrinker_rwsem); 116static DECLARE_RWSEM(shrinker_rwsem);
@@ -288,11 +290,23 @@ static void handle_write_error(struct address_space *mapping,
288 unlock_page(page); 290 unlock_page(page);
289} 291}
290 292
293/* possible outcome of pageout() */
294typedef enum {
295 /* failed to write page out, page is locked */
296 PAGE_KEEP,
297 /* move page to the active list, page is locked */
298 PAGE_ACTIVATE,
299 /* page has been sent to the disk successfully, page is unlocked */
300 PAGE_SUCCESS,
301 /* page is clean and locked */
302 PAGE_CLEAN,
303} pageout_t;
304
291/* 305/*
292 * pageout is called by shrink_page_list() for each dirty page. 306 * pageout is called by shrink_page_list() for each dirty page.
293 * Calls ->writepage(). 307 * Calls ->writepage().
294 */ 308 */
295pageout_t pageout(struct page *page, struct address_space *mapping) 309static pageout_t pageout(struct page *page, struct address_space *mapping)
296{ 310{
297 /* 311 /*
298 * If the page is dirty, only perform writeback if that write 312 * If the page is dirty, only perform writeback if that write
@@ -337,6 +351,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping)
337 struct writeback_control wbc = { 351 struct writeback_control wbc = {
338 .sync_mode = WB_SYNC_NONE, 352 .sync_mode = WB_SYNC_NONE,
339 .nr_to_write = SWAP_CLUSTER_MAX, 353 .nr_to_write = SWAP_CLUSTER_MAX,
354 .range_start = 0,
355 .range_end = LLONG_MAX,
340 .nonblocking = 1, 356 .nonblocking = 1,
341 .for_reclaim = 1, 357 .for_reclaim = 1,
342 }; 358 };
@@ -727,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
727 * how much memory 743 * how much memory
728 * is mapped. 744 * is mapped.
729 */ 745 */
730 mapped_ratio = (sc->nr_mapped * 100) / total_memory; 746 mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages;
731 747
732 /* 748 /*
733 * Now decide how much we really want to unmap some pages. The 749 * Now decide how much we really want to unmap some pages. The
@@ -741,7 +757,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
741 * A 100% value of vm_swappiness overrides this algorithm 757 * A 100% value of vm_swappiness overrides this algorithm
742 * altogether. 758 * altogether.
743 */ 759 */
744 swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; 760 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
745 761
746 /* 762 /*
747 * Now use this metric to decide whether to start moving mapped 763 * Now use this metric to decide whether to start moving mapped
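
With swappiness carried in struct scan_control, the decision point above becomes swap_tendency = mapped_ratio / 2 + distress + sc->swappiness, so a caller can force mapped-page reclaim per reclaim attempt (shrink_all_memory() below sets sc.swappiness = 100 in its later passes) without touching the global sysctl. As a rough worked example with the default value of 60: if mapped pages account for half of memory (mapped_ratio = 50) and reclaim is under no pressure (distress = 0), then swap_tendency = 25 + 0 + 60 = 85, which stays below the usual trigger of 100 and leaves mapped pages alone; with swappiness = 100 the same state yields 125 and mapped pages become eligible for reclaim.
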
@@ -957,6 +973,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
957 .may_writepage = !laptop_mode, 973 .may_writepage = !laptop_mode,
958 .swap_cluster_max = SWAP_CLUSTER_MAX, 974 .swap_cluster_max = SWAP_CLUSTER_MAX,
959 .may_swap = 1, 975 .may_swap = 1,
976 .swappiness = vm_swappiness,
960 }; 977 };
961 978
962 inc_page_state(allocstall); 979 inc_page_state(allocstall);
@@ -1021,10 +1038,6 @@ out:
1021 * For kswapd, balance_pgdat() will work across all this node's zones until 1038 * For kswapd, balance_pgdat() will work across all this node's zones until
1022 * they are all at pages_high. 1039 * they are all at pages_high.
1023 * 1040 *
1024 * If `nr_pages' is non-zero then it is the number of pages which are to be
1025 * reclaimed, regardless of the zone occupancies. This is a software suspend
1026 * special.
1027 *
1028 * Returns the number of pages which were actually freed. 1041 * Returns the number of pages which were actually freed.
1029 * 1042 *
1030 * There is special handling here for zones which are full of pinned pages. 1043 * There is special handling here for zones which are full of pinned pages.
@@ -1042,10 +1055,8 @@ out:
1042 * the page allocator fallback scheme to ensure that aging of pages is balanced 1055 * the page allocator fallback scheme to ensure that aging of pages is balanced
1043 * across the zones. 1056 * across the zones.
1044 */ 1057 */
1045static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, 1058static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1046 int order)
1047{ 1059{
1048 unsigned long to_free = nr_pages;
1049 int all_zones_ok; 1060 int all_zones_ok;
1050 int priority; 1061 int priority;
1051 int i; 1062 int i;
@@ -1055,7 +1066,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
1055 struct scan_control sc = { 1066 struct scan_control sc = {
1056 .gfp_mask = GFP_KERNEL, 1067 .gfp_mask = GFP_KERNEL,
1057 .may_swap = 1, 1068 .may_swap = 1,
1058 .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, 1069 .swap_cluster_max = SWAP_CLUSTER_MAX,
1070 .swappiness = vm_swappiness,
1059 }; 1071 };
1060 1072
1061loop_again: 1073loop_again:
@@ -1082,31 +1094,26 @@ loop_again:
1082 1094
1083 all_zones_ok = 1; 1095 all_zones_ok = 1;
1084 1096
1085 if (nr_pages == 0) { 1097 /*
1086 /* 1098 * Scan in the highmem->dma direction for the highest
1087 * Scan in the highmem->dma direction for the highest 1099 * zone which needs scanning
1088 * zone which needs scanning 1100 */
1089 */ 1101 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1090 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1102 struct zone *zone = pgdat->node_zones + i;
1091 struct zone *zone = pgdat->node_zones + i;
1092 1103
1093 if (!populated_zone(zone)) 1104 if (!populated_zone(zone))
1094 continue; 1105 continue;
1095 1106
1096 if (zone->all_unreclaimable && 1107 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1097 priority != DEF_PRIORITY) 1108 continue;
1098 continue;
1099 1109
1100 if (!zone_watermark_ok(zone, order, 1110 if (!zone_watermark_ok(zone, order, zone->pages_high,
1101 zone->pages_high, 0, 0)) { 1111 0, 0)) {
1102 end_zone = i; 1112 end_zone = i;
1103 goto scan; 1113 goto scan;
1104 }
1105 } 1114 }
1106 goto out;
1107 } else {
1108 end_zone = pgdat->nr_zones - 1;
1109 } 1115 }
1116 goto out;
1110scan: 1117scan:
1111 for (i = 0; i <= end_zone; i++) { 1118 for (i = 0; i <= end_zone; i++) {
1112 struct zone *zone = pgdat->node_zones + i; 1119 struct zone *zone = pgdat->node_zones + i;
@@ -1133,11 +1140,9 @@ scan:
1133 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1140 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1134 continue; 1141 continue;
1135 1142
1136 if (nr_pages == 0) { /* Not software suspend */ 1143 if (!zone_watermark_ok(zone, order, zone->pages_high,
1137 if (!zone_watermark_ok(zone, order, 1144 end_zone, 0))
1138 zone->pages_high, end_zone, 0)) 1145 all_zones_ok = 0;
1139 all_zones_ok = 0;
1140 }
1141 zone->temp_priority = priority; 1146 zone->temp_priority = priority;
1142 if (zone->prev_priority > priority) 1147 if (zone->prev_priority > priority)
1143 zone->prev_priority = priority; 1148 zone->prev_priority = priority;
@@ -1162,8 +1167,6 @@ scan:
1162 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1167 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1163 sc.may_writepage = 1; 1168 sc.may_writepage = 1;
1164 } 1169 }
1165 if (nr_pages && to_free > nr_reclaimed)
1166 continue; /* swsusp: need to do more work */
1167 if (all_zones_ok) 1170 if (all_zones_ok)
1168 break; /* kswapd: all done */ 1171 break; /* kswapd: all done */
1169 /* 1172 /*
@@ -1179,7 +1182,7 @@ scan:
1179 * matches the direct reclaim path behaviour in terms of impact 1182 * matches the direct reclaim path behaviour in terms of impact
1180 * on zone->*_priority. 1183 * on zone->*_priority.
1181 */ 1184 */
1182 if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) 1185 if (nr_reclaimed >= SWAP_CLUSTER_MAX)
1183 break; 1186 break;
1184 } 1187 }
1185out: 1188out:
@@ -1261,7 +1264,7 @@ static int kswapd(void *p)
1261 } 1264 }
1262 finish_wait(&pgdat->kswapd_wait, &wait); 1265 finish_wait(&pgdat->kswapd_wait, &wait);
1263 1266
1264 balance_pgdat(pgdat, 0, order); 1267 balance_pgdat(pgdat, order);
1265 } 1268 }
1266 return 0; 1269 return 0;
1267} 1270}
@@ -1290,35 +1293,154 @@ void wakeup_kswapd(struct zone *zone, int order)
1290 1293
1291#ifdef CONFIG_PM 1294#ifdef CONFIG_PM
1292/* 1295/*
1293 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed 1296 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
1294 * pages. 1297 * from LRU lists system-wide, for given pass and priority, and returns the
1298 * number of reclaimed pages
1299 *
1300 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
1301 */
1302static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
1303 int prio, struct scan_control *sc)
1304{
1305 struct zone *zone;
1306 unsigned long nr_to_scan, ret = 0;
1307
1308 for_each_zone(zone) {
1309
1310 if (!populated_zone(zone))
1311 continue;
1312
1313 if (zone->all_unreclaimable && prio != DEF_PRIORITY)
1314 continue;
1315
1316 /* For pass = 0 we don't shrink the active list */
1317 if (pass > 0) {
1318 zone->nr_scan_active += (zone->nr_active >> prio) + 1;
1319 if (zone->nr_scan_active >= nr_pages || pass > 3) {
1320 zone->nr_scan_active = 0;
1321 nr_to_scan = min(nr_pages, zone->nr_active);
1322 shrink_active_list(nr_to_scan, zone, sc);
1323 }
1324 }
1325
1326 zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
1327 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
1328 zone->nr_scan_inactive = 0;
1329 nr_to_scan = min(nr_pages, zone->nr_inactive);
1330 ret += shrink_inactive_list(nr_to_scan, zone, sc);
1331 if (ret >= nr_pages)
1332 return ret;
1333 }
1334 }
1335
1336 return ret;
1337}
1338
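
Editor's note: the new helper batches scanning per zone — each call adds roughly nr_inactive >> prio (and likewise for the active list) to a per-zone counter and only shrinks the list once that counter covers the whole request, so small zones are not hammered at high priorities; on the late passes (pass > 3) it scans regardless. A stand-alone sketch of that accumulation for the inactive side, with mock zone fields and a stub shrink routine rather than the kernel's LRU lists:

        #include <stdio.h>

        /* Mock zone: just the inactive-list bookkeeping used by the sketch. */
        struct mock_zone {
                unsigned long nr_inactive;
                unsigned long nr_scan_inactive;
        };

        /* Stub for shrink_inactive_list(): pretend everything asked for is freed. */
        static unsigned long shrink_inactive_stub(unsigned long nr_to_scan,
                                                  struct mock_zone *zone)
        {
                if (nr_to_scan > zone->nr_inactive)
                        nr_to_scan = zone->nr_inactive;
                zone->nr_inactive -= nr_to_scan;
                return nr_to_scan;
        }

        /*
         * One shrink_all_zones()-style step for a single zone: accumulate a
         * scan quota of (nr_inactive >> prio) + 1 and only scan once the quota
         * covers the whole request (or unconditionally on the late passes).
         */
        static unsigned long shrink_zone_step(struct mock_zone *zone,
                                              unsigned long nr_pages,
                                              int prio, int pass)
        {
                unsigned long nr_to_scan;

                zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
                if (zone->nr_scan_inactive < nr_pages && pass <= 3)
                        return 0;               /* not enough quota yet */

                zone->nr_scan_inactive = 0;
                nr_to_scan = nr_pages < zone->nr_inactive ?
                             nr_pages : zone->nr_inactive;
                return shrink_inactive_stub(nr_to_scan, zone);
        }

        int main(void)
        {
                struct mock_zone zone = { .nr_inactive = 10000, .nr_scan_inactive = 0 };
                unsigned long freed = 0;
                int prio;

                /* Walk priorities the way shrink_all_memory() does, pass 0. */
                for (prio = 12; prio >= 0 && freed < 1000; prio--)
                        freed += shrink_zone_step(&zone, 1000, prio, 0);

                printf("freed %lu pages, %lu still inactive\n",
                       freed, zone.nr_inactive);
                return 0;
        }
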
1339/*
1340 * Try to free `nr_pages' of memory, system-wide, and return the number of
1341 * freed pages.
1342 *
1343 * Rather than trying to age LRUs the aim is to preserve the overall
1344 * LRU order by reclaiming preferentially
1345 * inactive > active > active referenced > active mapped
1295 */ 1346 */
1296unsigned long shrink_all_memory(unsigned long nr_pages) 1347unsigned long shrink_all_memory(unsigned long nr_pages)
1297{ 1348{
1298 pg_data_t *pgdat; 1349 unsigned long lru_pages, nr_slab;
1299 unsigned long nr_to_free = nr_pages;
1300 unsigned long ret = 0; 1350 unsigned long ret = 0;
1301 unsigned retry = 2; 1351 int pass;
1302 struct reclaim_state reclaim_state = { 1352 struct reclaim_state reclaim_state;
1303 .reclaimed_slab = 0, 1353 struct zone *zone;
1354 struct scan_control sc = {
1355 .gfp_mask = GFP_KERNEL,
1356 .may_swap = 0,
1357 .swap_cluster_max = nr_pages,
1358 .may_writepage = 1,
1359 .swappiness = vm_swappiness,
1304 }; 1360 };
1305 1361
1306 current->reclaim_state = &reclaim_state; 1362 current->reclaim_state = &reclaim_state;
1307repeat:
1308 for_each_online_pgdat(pgdat) {
1309 unsigned long freed;
1310 1363
1311 freed = balance_pgdat(pgdat, nr_to_free, 0); 1364 lru_pages = 0;
1312 ret += freed; 1365 for_each_zone(zone)
1313 nr_to_free -= freed; 1366 lru_pages += zone->nr_active + zone->nr_inactive;
1314 if ((long)nr_to_free <= 0) 1367
1368 nr_slab = read_page_state(nr_slab);
1369 /* If slab caches are huge, it's better to hit them first */
1370 while (nr_slab >= lru_pages) {
1371 reclaim_state.reclaimed_slab = 0;
1372 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
1373 if (!reclaim_state.reclaimed_slab)
1315 break; 1374 break;
1375
1376 ret += reclaim_state.reclaimed_slab;
1377 if (ret >= nr_pages)
1378 goto out;
1379
1380 nr_slab -= reclaim_state.reclaimed_slab;
1316 } 1381 }
1317 if (retry-- && ret < nr_pages) { 1382
1318 blk_congestion_wait(WRITE, HZ/5); 1383 /*
1319 goto repeat; 1384 * We try to shrink LRUs in 5 passes:
1385 * 0 = Reclaim from inactive_list only
1386 * 1 = Reclaim from active list but don't reclaim mapped
1387 * 2 = 2nd pass of type 1
1388 * 3 = Reclaim mapped (normal reclaim)
1389 * 4 = 2nd pass of type 3
1390 */
1391 for (pass = 0; pass < 5; pass++) {
1392 int prio;
1393
1394 /* Needed for shrinking slab caches later on */
1395 if (!lru_pages)
1396 for_each_zone(zone) {
1397 lru_pages += zone->nr_active;
1398 lru_pages += zone->nr_inactive;
1399 }
1400
1401 /* Force reclaiming mapped pages in the passes #3 and #4 */
1402 if (pass > 2) {
1403 sc.may_swap = 1;
1404 sc.swappiness = 100;
1405 }
1406
1407 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
1408 unsigned long nr_to_scan = nr_pages - ret;
1409
1410 sc.nr_mapped = read_page_state(nr_mapped);
1411 sc.nr_scanned = 0;
1412
1413 ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
1414 if (ret >= nr_pages)
1415 goto out;
1416
1417 reclaim_state.reclaimed_slab = 0;
1418 shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
1419 ret += reclaim_state.reclaimed_slab;
1420 if (ret >= nr_pages)
1421 goto out;
1422
1423 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
1424 blk_congestion_wait(WRITE, HZ / 10);
1425 }
1426
1427 lru_pages = 0;
1320 } 1428 }
1429
1430 /*
1431 * If ret = 0, we could not shrink LRUs, but there may be something
1432 * in slab caches
1433 */
1434 if (!ret)
1435 do {
1436 reclaim_state.reclaimed_slab = 0;
1437 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
1438 ret += reclaim_state.reclaimed_slab;
1439 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1440
1441out:
1321 current->reclaim_state = NULL; 1442 current->reclaim_state = NULL;
1443
1322 return ret; 1444 return ret;
1323} 1445}
1324#endif 1446#endif
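
Editor's note: taken together, the rewritten shrink_all_memory() frees memory for suspend in five increasingly aggressive passes (inactive only, then active-but-unmapped twice, then mapped twice), walking the usual priority ladder inside each pass and topping up from slab caches along the way; the initial slab-heavy loop and the final slab-only fallback are left out of the sketch below for brevity. The code is a hedged, user-space illustration of that orchestration, with stub reclaim routines in place of shrink_all_zones() and shrink_slab():

        #include <stdio.h>
        #include <stdbool.h>

        #define DEF_PRIORITY 12

        /* Knobs the passes flip, mirroring the scan_control fields in the patch. */
        struct mock_scan_control {
                bool may_swap;          /* reclaim mapped pages? */
                int swappiness;
        };

        /* Stub for shrink_all_zones(): reclaim more as priority drops and once
         * mapped pages are fair game; the pass number itself is unused here. */
        static unsigned long shrink_zones_stub(unsigned long nr_to_scan, int prio,
                                               int pass, struct mock_scan_control *sc)
        {
                unsigned long got = (DEF_PRIORITY - prio) * (sc->may_swap ? 8 : 2);

                (void)pass;
                return got < nr_to_scan ? got : nr_to_scan;
        }

        /* Stub for shrink_slab(): pretend a fixed trickle of slab pages per call. */
        static unsigned long shrink_slab_stub(void)
        {
                return 5;
        }

        static unsigned long shrink_all_memory_sketch(unsigned long nr_pages)
        {
                struct mock_scan_control sc = { .may_swap = false, .swappiness = 60 };
                unsigned long ret = 0;
                int pass, prio;

                /*
                 * The patch runs five passes: 0 touches only the inactive list,
                 * 1-2 add the active list but skip mapped pages, 3-4 reclaim
                 * mapped pages too.  This sketch only models the pass 3-4
                 * switch to more aggressive settings.
                 */
                for (pass = 0; pass < 5; pass++) {
                        if (pass > 2) {
                                sc.may_swap = true;
                                sc.swappiness = 100;
                        }
                        for (prio = DEF_PRIORITY; prio >= 0; prio--) {
                                ret += shrink_zones_stub(nr_pages - ret, prio,
                                                         pass, &sc);
                                if (ret >= nr_pages)
                                        return ret;
                                ret += shrink_slab_stub();
                                if (ret >= nr_pages)
                                        return ret;
                        }
                }
                return ret;
        }

        int main(void)
        {
                unsigned long want = 400;
                unsigned long got = shrink_all_memory_sketch(want);

                printf("asked for %lu pages, sketch reclaimed %lu\n", want, got);
                return 0;
        }
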
@@ -1360,7 +1482,6 @@ static int __init kswapd_init(void)
1360 pgdat->kswapd = find_task_by_pid(pid); 1482 pgdat->kswapd = find_task_by_pid(pid);
1361 read_unlock(&tasklist_lock); 1483 read_unlock(&tasklist_lock);
1362 } 1484 }
1363 total_memory = nr_free_pagecache_pages();
1364 hotcpu_notifier(cpu_callback, 0); 1485 hotcpu_notifier(cpu_callback, 0);
1365 return 0; 1486 return 0;
1366} 1487}
@@ -1416,6 +1537,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1416 .swap_cluster_max = max_t(unsigned long, nr_pages, 1537 .swap_cluster_max = max_t(unsigned long, nr_pages,
1417 SWAP_CLUSTER_MAX), 1538 SWAP_CLUSTER_MAX),
1418 .gfp_mask = gfp_mask, 1539 .gfp_mask = gfp_mask,
1540 .swappiness = vm_swappiness,
1419 }; 1541 };
1420 1542
1421 disable_swap_token(); 1543 disable_swap_token();