author     Christoph Lameter <clameter@sgi.com>    2006-03-22 03:09:12 -0500
committer  Linus Torvalds <torvalds@g5.osdl.org>   2006-03-22 10:54:06 -0500
commit     b20a35035f983f4ac7e29c4a68f30e43510007e0 (patch)
tree       fdf090ddddbcc275349f62f71adc98649e2c683b /mm
parent     442295c94bf650221af3ef20fc68fa3e93876818 (diff)
[PATCH] page migration reorg
Centralize the page migration functions in anticipation of additional
tinkering.  Creates a new file mm/migrate.c

1. Extract buffer_migrate_page() from fs/buffer.c
2. Extract central migration code from vmscan.c
3. Extract some components from mempolicy.c
4. Export pageout() and remove_from_swap() from vmscan.c
5. Make it possible to configure NUMA systems without page migration and
   non-NUMA systems with page migration.

I had to do some #ifdeffing in mempolicy.c that may need a cleanup.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
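Illustrative sketch only, not part of the commit: how a caller is expected to drive the relocated API, roughly mirroring what check_range()/do_mbind() in mm/mempolicy.c do after this patch (prepare, isolate, migrate, put leftovers back on the LRU). The wrapper move_page_to_node() is hypothetical; the called functions and their signatures are taken from mm/migrate.c below.

    #include <linux/migrate.h>
    #include <linux/mm.h>
    #include <linux/list.h>

    /* Hypothetical helper -- not in the patch. */
    static int move_page_to_node(struct page *page, int target_node)
    {
            LIST_HEAD(pagelist);
            int err;

            /* As check_range() does: fails without swap, drains LRU pagevecs. */
            err = migrate_prep();
            if (err)
                    return err;

            /* Take the page off the LRU with an elevated refcount. */
            if (isolate_lru_page(page, &pagelist))
                    return -EBUSY;

            /* Allocate target pages on target_node and migrate the list. */
            err = migrate_pages_to(&pagelist, NULL, target_node);

            /* Pages that failed were spliced back onto pagelist; re-add them. */
            if (!list_empty(&pagelist))
                    putback_lru_pages(&pagelist);

            /* >0 means some pages were not migrated, <0 is an errno. */
            return err;
    }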
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig         6
-rw-r--r--  mm/Makefile        2
-rw-r--r--  mm/mempolicy.c   113
-rw-r--r--  mm/migrate.c     655
-rw-r--r--  mm/swap_state.c    1
-rw-r--r--  mm/vmscan.c      491
6 files changed, 689 insertions(+), 579 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index a9cb80ae6409..bd80460360db 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS
137# support for page migration 137# support for page migration
138# 138#
139config MIGRATION 139config MIGRATION
140 bool "Page migration"
140 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM 141 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
141 depends on SWAP 142 depends on SWAP
143 help
144 Allows the migration of the physical location of pages of processes
145 while the virtual addresses are not changed. This is useful for
146 example on NUMA systems to put pages nearer to the processors accessing
147 the page.
diff --git a/mm/Makefile b/mm/Makefile
index 9aa03fa1dcc3..f10c753dce6d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o 22obj-$(CONFIG_SLAB) += slab.o
23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
24obj-$(CONFIG_FS_XIP) += filemap_xip.o 24obj-$(CONFIG_FS_XIP) += filemap_xip.o
25obj-$(CONFIG_MIGRATION) += migrate.o
26
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 96195dcb62e1..e93cc740c22b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,7 @@
86#include <linux/swap.h> 86#include <linux/swap.h>
87#include <linux/seq_file.h> 87#include <linux/seq_file.h>
88#include <linux/proc_fs.h> 88#include <linux/proc_fs.h>
89#include <linux/migrate.h>
89 90
90#include <asm/tlbflush.h> 91#include <asm/tlbflush.h>
91#include <asm/uaccess.h> 92#include <asm/uaccess.h>
@@ -95,9 +96,6 @@
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 96#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ 97#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97 98
98/* The number of pages to migrate per call to migrate_pages() */
99#define MIGRATE_CHUNK_SIZE 256
100
101static struct kmem_cache *policy_cache; 99static struct kmem_cache *policy_cache;
102static struct kmem_cache *sn_cache; 100static struct kmem_cache *sn_cache;
103 101
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
331 struct vm_area_struct *first, *vma, *prev; 329 struct vm_area_struct *first, *vma, *prev;
332 330
333 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 331 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
334 /* Must have swap device for migration */
335 if (nr_swap_pages <= 0)
336 return ERR_PTR(-ENODEV);
337 332
338 /* 333 err = migrate_prep();
339 * Clear the LRU lists so pages can be isolated. 334 if (err)
340 * Note that pages may be moved off the LRU after we have 335 return ERR_PTR(err);
341 * drained them. Those pages will fail to migrate like other
342 * pages that may be busy.
343 */
344 lru_add_drain_all();
345 } 336 }
346 337
347 first = find_vma(mm, start); 338 first = find_vma(mm, start);
@@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
550 return err; 541 return err;
551} 542}
552 543
544#ifdef CONFIG_MIGRATION
553/* 545/*
554 * page migration 546 * page migration
555 */ 547 */
556
557static void migrate_page_add(struct page *page, struct list_head *pagelist, 548static void migrate_page_add(struct page *page, struct list_head *pagelist,
558 unsigned long flags) 549 unsigned long flags)
559{ 550{
560 /* 551 /*
561 * Avoid migrating a page that is shared with others. 552 * Avoid migrating a page that is shared with others.
562 */ 553 */
563 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 554 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
564 if (isolate_lru_page(page)) 555 isolate_lru_page(page, pagelist);
565 list_add_tail(&page->lru, pagelist);
566 }
567}
568
569/*
570 * Migrate the list 'pagelist' of pages to a certain destination.
571 *
572 * Specify destination with either non-NULL vma or dest_node >= 0
573 * Return the number of pages not migrated or error code
574 */
575static int migrate_pages_to(struct list_head *pagelist,
576 struct vm_area_struct *vma, int dest)
577{
578 LIST_HEAD(newlist);
579 LIST_HEAD(moved);
580 LIST_HEAD(failed);
581 int err = 0;
582 unsigned long offset = 0;
583 int nr_pages;
584 struct page *page;
585 struct list_head *p;
586
587redo:
588 nr_pages = 0;
589 list_for_each(p, pagelist) {
590 if (vma) {
591 /*
592 * The address passed to alloc_page_vma is used to
593 * generate the proper interleave behavior. We fake
594 * the address here by an increasing offset in order
595 * to get the proper distribution of pages.
596 *
597 * No decision has been made as to which page
598 * a certain old page is moved to so we cannot
599 * specify the correct address.
600 */
601 page = alloc_page_vma(GFP_HIGHUSER, vma,
602 offset + vma->vm_start);
603 offset += PAGE_SIZE;
604 }
605 else
606 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
607
608 if (!page) {
609 err = -ENOMEM;
610 goto out;
611 }
612 list_add_tail(&page->lru, &newlist);
613 nr_pages++;
614 if (nr_pages > MIGRATE_CHUNK_SIZE)
615 break;
616 }
617 err = migrate_pages(pagelist, &newlist, &moved, &failed);
618
619 putback_lru_pages(&moved); /* Call release pages instead ?? */
620
621 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
622 goto redo;
623out:
624 /* Return leftover allocated pages */
625 while (!list_empty(&newlist)) {
626 page = list_entry(newlist.next, struct page, lru);
627 list_del(&page->lru);
628 __free_page(page);
629 }
630 list_splice(&failed, pagelist);
631 if (err < 0)
632 return err;
633
634 /* Calculate number of leftover pages */
635 nr_pages = 0;
636 list_for_each(p, pagelist)
637 nr_pages++;
638 return nr_pages;
639} 556}
640 557
641/* 558/*
@@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm,
742 if (err < 0) 659 if (err < 0)
743 return err; 660 return err;
744 return busy; 661 return busy;
662
745} 663}
746 664
665#else
666
667static void migrate_page_add(struct page *page, struct list_head *pagelist,
668 unsigned long flags)
669{
670}
671
672int do_migrate_pages(struct mm_struct *mm,
673 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
674{
675 return -ENOSYS;
676}
677#endif
678
747long do_mbind(unsigned long start, unsigned long len, 679long do_mbind(unsigned long start, unsigned long len,
748 unsigned long mode, nodemask_t *nmask, unsigned long flags) 680 unsigned long mode, nodemask_t *nmask, unsigned long flags)
749{ 681{
@@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len,
808 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 740 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
809 err = -EIO; 741 err = -EIO;
810 } 742 }
743
811 if (!list_empty(&pagelist)) 744 if (!list_empty(&pagelist))
812 putback_lru_pages(&pagelist); 745 putback_lru_pages(&pagelist);
813 746
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 000000000000..09f6e4aa87fc
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,655 @@
1/*
2 * Memory Migration functionality - linux/mm/migration.c
3 *
4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5 *
6 * Page migration was first developed in the context of the memory hotplug
7 * project. The main authors of the migration code are:
8 *
9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10 * Hirokazu Takahashi <taka@valinux.co.jp>
11 * Dave Hansen <haveblue@us.ibm.com>
12 * Christoph Lameter <clameter@sgi.com>
13 */
14
15#include <linux/migrate.h>
16#include <linux/module.h>
17#include <linux/swap.h>
18#include <linux/pagemap.h>
19#include <linux/buffer_head.h> /* for try_to_release_page(),
20 buffer_heads_over_limit */
21#include <linux/mm_inline.h>
22#include <linux/pagevec.h>
23#include <linux/rmap.h>
24#include <linux/topology.h>
25#include <linux/cpu.h>
26#include <linux/cpuset.h>
27#include <linux/swapops.h>
28
29#include "internal.h"
30
31#include "internal.h"
32
33/* The maximum number of pages to take off the LRU for migration */
34#define MIGRATE_CHUNK_SIZE 256
35
36#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
37
38/*
39 * Isolate one page from the LRU lists. If successful put it onto
40 * the indicated list with elevated page count.
41 *
42 * Result:
43 * -EBUSY: page not on LRU list
44 * 0: page removed from LRU list and added to the specified list.
45 */
46int isolate_lru_page(struct page *page, struct list_head *pagelist)
47{
48 int ret = -EBUSY;
49
50 if (PageLRU(page)) {
51 struct zone *zone = page_zone(page);
52
53 spin_lock_irq(&zone->lru_lock);
54 if (PageLRU(page)) {
55 ret = 0;
56 get_page(page);
57 ClearPageLRU(page);
58 if (PageActive(page))
59 del_page_from_active_list(zone, page);
60 else
61 del_page_from_inactive_list(zone, page);
62 list_add_tail(&page->lru, pagelist);
63 }
64 spin_unlock_irq(&zone->lru_lock);
65 }
66 return ret;
67}
68
69/*
70 * migrate_prep() needs to be called after we have compiled the list of pages
71 * to be migrated using isolate_lru_page() but before we begin a series of calls
72 * to migrate_pages().
73 */
74int migrate_prep(void)
75{
76 /* Must have swap device for migration */
77 if (nr_swap_pages <= 0)
78 return -ENODEV;
79
80 /*
81 * Clear the LRU lists so pages can be isolated.
82 * Note that pages may be moved off the LRU after we have
83 * drained them. Those pages will fail to migrate like other
84 * pages that may be busy.
85 */
86 lru_add_drain_all();
87
88 return 0;
89}
90
91static inline void move_to_lru(struct page *page)
92{
93 list_del(&page->lru);
94 if (PageActive(page)) {
95 /*
96 * lru_cache_add_active checks that
97 * the PG_active bit is off.
98 */
99 ClearPageActive(page);
100 lru_cache_add_active(page);
101 } else {
102 lru_cache_add(page);
103 }
104 put_page(page);
105}
106
107/*
108 * Add isolated pages on the list back to the LRU.
109 *
110 * returns the number of pages put back.
111 */
112int putback_lru_pages(struct list_head *l)
113{
114 struct page *page;
115 struct page *page2;
116 int count = 0;
117
118 list_for_each_entry_safe(page, page2, l, lru) {
119 move_to_lru(page);
120 count++;
121 }
122 return count;
123}
124
125/*
126 * Non migratable page
127 */
128int fail_migrate_page(struct page *newpage, struct page *page)
129{
130 return -EIO;
131}
132EXPORT_SYMBOL(fail_migrate_page);
133
134/*
135 * swapout a single page
136 * page is locked upon entry, unlocked on exit
137 */
138static int swap_page(struct page *page)
139{
140 struct address_space *mapping = page_mapping(page);
141
142 if (page_mapped(page) && mapping)
143 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
144 goto unlock_retry;
145
146 if (PageDirty(page)) {
147 /* Page is dirty, try to write it out here */
148 switch(pageout(page, mapping)) {
149 case PAGE_KEEP:
150 case PAGE_ACTIVATE:
151 goto unlock_retry;
152
153 case PAGE_SUCCESS:
154 goto retry;
155
156 case PAGE_CLEAN:
157 ; /* try to free the page below */
158 }
159 }
160
161 if (PagePrivate(page)) {
162 if (!try_to_release_page(page, GFP_KERNEL) ||
163 (!mapping && page_count(page) == 1))
164 goto unlock_retry;
165 }
166
167 if (remove_mapping(mapping, page)) {
168 /* Success */
169 unlock_page(page);
170 return 0;
171 }
172
173unlock_retry:
174 unlock_page(page);
175
176retry:
177 return -EAGAIN;
178}
179EXPORT_SYMBOL(swap_page);
180
181/*
182 * Remove references for a page and establish the new page with the correct
183 * basic settings to be able to stop accesses to the page.
184 */
185int migrate_page_remove_references(struct page *newpage,
186 struct page *page, int nr_refs)
187{
188 struct address_space *mapping = page_mapping(page);
189 struct page **radix_pointer;
190
191 /*
192 * Avoid doing any of the following work if the page count
193 * indicates that the page is in use or truncate has removed
194 * the page.
195 */
196 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
197 return -EAGAIN;
198
199 /*
200 * Establish swap ptes for anonymous pages or destroy pte
201 * maps for files.
202 *
203 * In order to reestablish file backed mappings the fault handlers
204 * will take the radix tree_lock which may then be used to stop
205 * processses from accessing this page until the new page is ready.
206 *
207 * A process accessing via a swap pte (an anonymous page) will take a
208 * page_lock on the old page which will block the process until the
209 * migration attempt is complete. At that time the PageSwapCache bit
210 * will be examined. If the page was migrated then the PageSwapCache
211 * bit will be clear and the operation to retrieve the page will be
212 * retried which will find the new page in the radix tree. Then a new
213 * direct mapping may be generated based on the radix tree contents.
214 *
215 * If the page was not migrated then the PageSwapCache bit
216 * is still set and the operation may continue.
217 */
218 if (try_to_unmap(page, 1) == SWAP_FAIL)
219 /* A vma has VM_LOCKED set -> permanent failure */
220 return -EPERM;
221
222 /*
223 * Give up if we were unable to remove all mappings.
224 */
225 if (page_mapcount(page))
226 return -EAGAIN;
227
228 write_lock_irq(&mapping->tree_lock);
229
230 radix_pointer = (struct page **)radix_tree_lookup_slot(
231 &mapping->page_tree,
232 page_index(page));
233
234 if (!page_mapping(page) || page_count(page) != nr_refs ||
235 *radix_pointer != page) {
236 write_unlock_irq(&mapping->tree_lock);
237 return 1;
238 }
239
240 /*
241 * Now we know that no one else is looking at the page.
242 *
243 * Certain minimal information about a page must be available
244 * in order for other subsystems to properly handle the page if they
245 * find it through the radix tree update before we are finished
246 * copying the page.
247 */
248 get_page(newpage);
249 newpage->index = page->index;
250 newpage->mapping = page->mapping;
251 if (PageSwapCache(page)) {
252 SetPageSwapCache(newpage);
253 set_page_private(newpage, page_private(page));
254 }
255
256 *radix_pointer = newpage;
257 __put_page(page);
258 write_unlock_irq(&mapping->tree_lock);
259
260 return 0;
261}
262EXPORT_SYMBOL(migrate_page_remove_references);
263
264/*
265 * Copy the page to its new location
266 */
267void migrate_page_copy(struct page *newpage, struct page *page)
268{
269 copy_highpage(newpage, page);
270
271 if (PageError(page))
272 SetPageError(newpage);
273 if (PageReferenced(page))
274 SetPageReferenced(newpage);
275 if (PageUptodate(page))
276 SetPageUptodate(newpage);
277 if (PageActive(page))
278 SetPageActive(newpage);
279 if (PageChecked(page))
280 SetPageChecked(newpage);
281 if (PageMappedToDisk(page))
282 SetPageMappedToDisk(newpage);
283
284 if (PageDirty(page)) {
285 clear_page_dirty_for_io(page);
286 set_page_dirty(newpage);
287 }
288
289 ClearPageSwapCache(page);
290 ClearPageActive(page);
291 ClearPagePrivate(page);
292 set_page_private(page, 0);
293 page->mapping = NULL;
294
295 /*
296 * If any waiters have accumulated on the new page then
297 * wake them up.
298 */
299 if (PageWriteback(newpage))
300 end_page_writeback(newpage);
301}
302EXPORT_SYMBOL(migrate_page_copy);
303
304/*
305 * Common logic to directly migrate a single page suitable for
306 * pages that do not use PagePrivate.
307 *
308 * Pages are locked upon entry and exit.
309 */
310int migrate_page(struct page *newpage, struct page *page)
311{
312 int rc;
313
314 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
315
316 rc = migrate_page_remove_references(newpage, page, 2);
317
318 if (rc)
319 return rc;
320
321 migrate_page_copy(newpage, page);
322
323 /*
324 * Remove auxiliary swap entries and replace
325 * them with real ptes.
326 *
327 * Note that a real pte entry will allow processes that are not
328 * waiting on the page lock to use the new page via the page tables
329 * before the new page is unlocked.
330 */
331 remove_from_swap(newpage);
332 return 0;
333}
334EXPORT_SYMBOL(migrate_page);
335
336/*
337 * migrate_pages
338 *
339 * Two lists are passed to this function. The first list
340 * contains the pages isolated from the LRU to be migrated.
341 * The second list contains new pages that the pages isolated
342 * can be moved to. If the second list is NULL then all
343 * pages are swapped out.
344 *
345 * The function returns after 10 attempts or if no pages
346 * are movable anymore because to has become empty
347 * or no retryable pages exist anymore.
348 *
349 * Return: Number of pages not migrated when "to" ran empty.
350 */
351int migrate_pages(struct list_head *from, struct list_head *to,
352 struct list_head *moved, struct list_head *failed)
353{
354 int retry;
355 int nr_failed = 0;
356 int pass = 0;
357 struct page *page;
358 struct page *page2;
359 int swapwrite = current->flags & PF_SWAPWRITE;
360 int rc;
361
362 if (!swapwrite)
363 current->flags |= PF_SWAPWRITE;
364
365redo:
366 retry = 0;
367
368 list_for_each_entry_safe(page, page2, from, lru) {
369 struct page *newpage = NULL;
370 struct address_space *mapping;
371
372 cond_resched();
373
374 rc = 0;
375 if (page_count(page) == 1)
376 /* page was freed from under us. So we are done. */
377 goto next;
378
379 if (to && list_empty(to))
380 break;
381
382 /*
383 * Skip locked pages during the first two passes to give the
384 * functions holding the lock time to release the page. Later we
385 * use lock_page() to have a higher chance of acquiring the
386 * lock.
387 */
388 rc = -EAGAIN;
389 if (pass > 2)
390 lock_page(page);
391 else
392 if (TestSetPageLocked(page))
393 goto next;
394
395 /*
396 * Only wait on writeback if we have already done a pass where
397 * we we may have triggered writeouts for lots of pages.
398 */
399 if (pass > 0) {
400 wait_on_page_writeback(page);
401 } else {
402 if (PageWriteback(page))
403 goto unlock_page;
404 }
405
406 /*
407 * Anonymous pages must have swap cache references otherwise
408 * the information contained in the page maps cannot be
409 * preserved.
410 */
411 if (PageAnon(page) && !PageSwapCache(page)) {
412 if (!add_to_swap(page, GFP_KERNEL)) {
413 rc = -ENOMEM;
414 goto unlock_page;
415 }
416 }
417
418 if (!to) {
419 rc = swap_page(page);
420 goto next;
421 }
422
423 newpage = lru_to_page(to);
424 lock_page(newpage);
425
426 /*
427 * Pages are properly locked and writeback is complete.
428 * Try to migrate the page.
429 */
430 mapping = page_mapping(page);
431 if (!mapping)
432 goto unlock_both;
433
434 if (mapping->a_ops->migratepage) {
435 /*
436 * Most pages have a mapping and most filesystems
437 * should provide a migration function. Anonymous
438 * pages are part of swap space which also has its
439 * own migration function. This is the most common
440 * path for page migration.
441 */
442 rc = mapping->a_ops->migratepage(newpage, page);
443 goto unlock_both;
444 }
445
446 /*
447 * Default handling if a filesystem does not provide
448 * a migration function. We can only migrate clean
449 * pages so try to write out any dirty pages first.
450 */
451 if (PageDirty(page)) {
452 switch (pageout(page, mapping)) {
453 case PAGE_KEEP:
454 case PAGE_ACTIVATE:
455 goto unlock_both;
456
457 case PAGE_SUCCESS:
458 unlock_page(newpage);
459 goto next;
460
461 case PAGE_CLEAN:
462 ; /* try to migrate the page below */
463 }
464 }
465
466 /*
467 * Buffers are managed in a filesystem specific way.
468 * We must have no buffers or drop them.
469 */
470 if (!page_has_buffers(page) ||
471 try_to_release_page(page, GFP_KERNEL)) {
472 rc = migrate_page(newpage, page);
473 goto unlock_both;
474 }
475
476 /*
477 * On early passes with mapped pages simply
478 * retry. There may be a lock held for some
479 * buffers that may go away. Later
480 * swap them out.
481 */
482 if (pass > 4) {
483 /*
484 * Persistently unable to drop buffers..... As a
485 * measure of last resort we fall back to
486 * swap_page().
487 */
488 unlock_page(newpage);
489 newpage = NULL;
490 rc = swap_page(page);
491 goto next;
492 }
493
494unlock_both:
495 unlock_page(newpage);
496
497unlock_page:
498 unlock_page(page);
499
500next:
501 if (rc == -EAGAIN) {
502 retry++;
503 } else if (rc) {
504 /* Permanent failure */
505 list_move(&page->lru, failed);
506 nr_failed++;
507 } else {
508 if (newpage) {
509 /* Successful migration. Return page to LRU */
510 move_to_lru(newpage);
511 }
512 list_move(&page->lru, moved);
513 }
514 }
515 if (retry && pass++ < 10)
516 goto redo;
517
518 if (!swapwrite)
519 current->flags &= ~PF_SWAPWRITE;
520
521 return nr_failed + retry;
522}
523
524/*
525 * Migration function for pages with buffers. This function can only be used
526 * if the underlying filesystem guarantees that no other references to "page"
527 * exist.
528 */
529int buffer_migrate_page(struct page *newpage, struct page *page)
530{
531 struct address_space *mapping = page->mapping;
532 struct buffer_head *bh, *head;
533 int rc;
534
535 if (!mapping)
536 return -EAGAIN;
537
538 if (!page_has_buffers(page))
539 return migrate_page(newpage, page);
540
541 head = page_buffers(page);
542
543 rc = migrate_page_remove_references(newpage, page, 3);
544
545 if (rc)
546 return rc;
547
548 bh = head;
549 do {
550 get_bh(bh);
551 lock_buffer(bh);
552 bh = bh->b_this_page;
553
554 } while (bh != head);
555
556 ClearPagePrivate(page);
557 set_page_private(newpage, page_private(page));
558 set_page_private(page, 0);
559 put_page(page);
560 get_page(newpage);
561
562 bh = head;
563 do {
564 set_bh_page(bh, newpage, bh_offset(bh));
565 bh = bh->b_this_page;
566
567 } while (bh != head);
568
569 SetPagePrivate(newpage);
570
571 migrate_page_copy(newpage, page);
572
573 bh = head;
574 do {
575 unlock_buffer(bh);
576 put_bh(bh);
577 bh = bh->b_this_page;
578
579 } while (bh != head);
580
581 return 0;
582}
583EXPORT_SYMBOL(buffer_migrate_page);
584
585/*
586 * Migrate the list 'pagelist' of pages to a certain destination.
587 *
588 * Specify destination with either non-NULL vma or dest_node >= 0
589 * Return the number of pages not migrated or error code
590 */
591int migrate_pages_to(struct list_head *pagelist,
592 struct vm_area_struct *vma, int dest)
593{
594 LIST_HEAD(newlist);
595 LIST_HEAD(moved);
596 LIST_HEAD(failed);
597 int err = 0;
598 unsigned long offset = 0;
599 int nr_pages;
600 struct page *page;
601 struct list_head *p;
602
603redo:
604 nr_pages = 0;
605 list_for_each(p, pagelist) {
606 if (vma) {
607 /*
608 * The address passed to alloc_page_vma is used to
609 * generate the proper interleave behavior. We fake
610 * the address here by an increasing offset in order
611 * to get the proper distribution of pages.
612 *
613 * No decision has been made as to which page
614 * a certain old page is moved to so we cannot
615 * specify the correct address.
616 */
617 page = alloc_page_vma(GFP_HIGHUSER, vma,
618 offset + vma->vm_start);
619 offset += PAGE_SIZE;
620 }
621 else
622 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
623
624 if (!page) {
625 err = -ENOMEM;
626 goto out;
627 }
628 list_add_tail(&page->lru, &newlist);
629 nr_pages++;
630 if (nr_pages > MIGRATE_CHUNK_SIZE)
631 break;
632 }
633 err = migrate_pages(pagelist, &newlist, &moved, &failed);
634
635 putback_lru_pages(&moved); /* Call release pages instead ?? */
636
637 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
638 goto redo;
639out:
640 /* Return leftover allocated pages */
641 while (!list_empty(&newlist)) {
642 page = list_entry(newlist.next, struct page, lru);
643 list_del(&page->lru);
644 __free_page(page);
645 }
646 list_splice(&failed, pagelist);
647 if (err < 0)
648 return err;
649
650 /* Calculate number of leftover pages */
651 nr_pages = 0;
652 list_for_each(p, pagelist)
653 nr_pages++;
654 return nr_pages;
655}
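Editorial sketch, not part of the patch: the newly exported buffer_migrate_page() is meant to be wired into a filesystem's address_space_operations so that migrate_pages() takes the ->migratepage() path above instead of falling back to writeout. The aops structure below is a hypothetical placeholder with the unrelated callbacks omitted; only the ->migratepage hook and buffer_migrate_page() come from this patch.

    #include <linux/fs.h>
    #include <linux/migrate.h>

    /* Hypothetical filesystem glue -- not in the patch. */
    static struct address_space_operations example_aops = {
            /* readpage/writepage and friends omitted for brevity */
            .migratepage    = buffer_migrate_page, /* exported by mm/migrate.c */
    };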
diff --git a/mm/swap_state.c b/mm/swap_state.c
index db8a3d3e1636..d7af296833fc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h> 17#include <linux/pagevec.h>
18#include <linux/migrate.h>
18 19
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20 21
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 548e023c193b..fd572bbdc9f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -42,18 +42,6 @@
42 42
43#include "internal.h" 43#include "internal.h"
44 44
45/* possible outcome of pageout() */
46typedef enum {
47 /* failed to write page out, page is locked */
48 PAGE_KEEP,
49 /* move page to the active list, page is locked */
50 PAGE_ACTIVATE,
51 /* page has been sent to the disk successfully, page is unlocked */
52 PAGE_SUCCESS,
53 /* page is clean and locked */
54 PAGE_CLEAN,
55} pageout_t;
56
57struct scan_control { 45struct scan_control {
58 /* Incremented by the number of inactive pages that were scanned */ 46 /* Incremented by the number of inactive pages that were scanned */
59 unsigned long nr_scanned; 47 unsigned long nr_scanned;
@@ -304,7 +292,7 @@ static void handle_write_error(struct address_space *mapping,
304 * pageout is called by shrink_page_list() for each dirty page. 292 * pageout is called by shrink_page_list() for each dirty page.
305 * Calls ->writepage(). 293 * Calls ->writepage().
306 */ 294 */
307static pageout_t pageout(struct page *page, struct address_space *mapping) 295pageout_t pageout(struct page *page, struct address_space *mapping)
308{ 296{
309 /* 297 /*
310 * If the page is dirty, only perform writeback if that write 298 * If the page is dirty, only perform writeback if that write
@@ -372,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
372 return PAGE_CLEAN; 360 return PAGE_CLEAN;
373} 361}
374 362
375static int remove_mapping(struct address_space *mapping, struct page *page) 363int remove_mapping(struct address_space *mapping, struct page *page)
376{ 364{
377 if (!mapping) 365 if (!mapping)
378 return 0; /* truncate got there first */ 366 return 0; /* truncate got there first */
@@ -570,481 +558,6 @@ keep:
570 return nr_reclaimed; 558 return nr_reclaimed;
571} 559}
572 560
573#ifdef CONFIG_MIGRATION
574static inline void move_to_lru(struct page *page)
575{
576 list_del(&page->lru);
577 if (PageActive(page)) {
578 /*
579 * lru_cache_add_active checks that
580 * the PG_active bit is off.
581 */
582 ClearPageActive(page);
583 lru_cache_add_active(page);
584 } else {
585 lru_cache_add(page);
586 }
587 put_page(page);
588}
589
590/*
591 * Add isolated pages on the list back to the LRU.
592 *
593 * returns the number of pages put back.
594 */
595unsigned long putback_lru_pages(struct list_head *l)
596{
597 struct page *page;
598 struct page *page2;
599 unsigned long count = 0;
600
601 list_for_each_entry_safe(page, page2, l, lru) {
602 move_to_lru(page);
603 count++;
604 }
605 return count;
606}
607
608/*
609 * Non migratable page
610 */
611int fail_migrate_page(struct page *newpage, struct page *page)
612{
613 return -EIO;
614}
615EXPORT_SYMBOL(fail_migrate_page);
616
617/*
618 * swapout a single page
619 * page is locked upon entry, unlocked on exit
620 */
621static int swap_page(struct page *page)
622{
623 struct address_space *mapping = page_mapping(page);
624
625 if (page_mapped(page) && mapping)
626 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
627 goto unlock_retry;
628
629 if (PageDirty(page)) {
630 /* Page is dirty, try to write it out here */
631 switch(pageout(page, mapping)) {
632 case PAGE_KEEP:
633 case PAGE_ACTIVATE:
634 goto unlock_retry;
635
636 case PAGE_SUCCESS:
637 goto retry;
638
639 case PAGE_CLEAN:
640 ; /* try to free the page below */
641 }
642 }
643
644 if (PagePrivate(page)) {
645 if (!try_to_release_page(page, GFP_KERNEL) ||
646 (!mapping && page_count(page) == 1))
647 goto unlock_retry;
648 }
649
650 if (remove_mapping(mapping, page)) {
651 /* Success */
652 unlock_page(page);
653 return 0;
654 }
655
656unlock_retry:
657 unlock_page(page);
658
659retry:
660 return -EAGAIN;
661}
662EXPORT_SYMBOL(swap_page);
663
664/*
665 * Page migration was first developed in the context of the memory hotplug
666 * project. The main authors of the migration code are:
667 *
668 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
669 * Hirokazu Takahashi <taka@valinux.co.jp>
670 * Dave Hansen <haveblue@us.ibm.com>
671 * Christoph Lameter <clameter@sgi.com>
672 */
673
674/*
675 * Remove references for a page and establish the new page with the correct
676 * basic settings to be able to stop accesses to the page.
677 */
678int migrate_page_remove_references(struct page *newpage,
679 struct page *page, int nr_refs)
680{
681 struct address_space *mapping = page_mapping(page);
682 struct page **radix_pointer;
683
684 /*
685 * Avoid doing any of the following work if the page count
686 * indicates that the page is in use or truncate has removed
687 * the page.
688 */
689 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
690 return -EAGAIN;
691
692 /*
693 * Establish swap ptes for anonymous pages or destroy pte
694 * maps for files.
695 *
696 * In order to reestablish file backed mappings the fault handlers
697 * will take the radix tree_lock which may then be used to stop
698 * processses from accessing this page until the new page is ready.
699 *
700 * A process accessing via a swap pte (an anonymous page) will take a
701 * page_lock on the old page which will block the process until the
702 * migration attempt is complete. At that time the PageSwapCache bit
703 * will be examined. If the page was migrated then the PageSwapCache
704 * bit will be clear and the operation to retrieve the page will be
705 * retried which will find the new page in the radix tree. Then a new
706 * direct mapping may be generated based on the radix tree contents.
707 *
708 * If the page was not migrated then the PageSwapCache bit
709 * is still set and the operation may continue.
710 */
711 if (try_to_unmap(page, 1) == SWAP_FAIL)
712 /* A vma has VM_LOCKED set -> Permanent failure */
713 return -EPERM;
714
715 /*
716 * Give up if we were unable to remove all mappings.
717 */
718 if (page_mapcount(page))
719 return -EAGAIN;
720
721 write_lock_irq(&mapping->tree_lock);
722
723 radix_pointer = (struct page **)radix_tree_lookup_slot(
724 &mapping->page_tree,
725 page_index(page));
726
727 if (!page_mapping(page) || page_count(page) != nr_refs ||
728 *radix_pointer != page) {
729 write_unlock_irq(&mapping->tree_lock);
730 return -EAGAIN;
731 }
732
733 /*
734 * Now we know that no one else is looking at the page.
735 *
736 * Certain minimal information about a page must be available
737 * in order for other subsystems to properly handle the page if they
738 * find it through the radix tree update before we are finished
739 * copying the page.
740 */
741 get_page(newpage);
742 newpage->index = page->index;
743 newpage->mapping = page->mapping;
744 if (PageSwapCache(page)) {
745 SetPageSwapCache(newpage);
746 set_page_private(newpage, page_private(page));
747 }
748
749 *radix_pointer = newpage;
750 __put_page(page);
751 write_unlock_irq(&mapping->tree_lock);
752
753 return 0;
754}
755EXPORT_SYMBOL(migrate_page_remove_references);
756
757/*
758 * Copy the page to its new location
759 */
760void migrate_page_copy(struct page *newpage, struct page *page)
761{
762 copy_highpage(newpage, page);
763
764 if (PageError(page))
765 SetPageError(newpage);
766 if (PageReferenced(page))
767 SetPageReferenced(newpage);
768 if (PageUptodate(page))
769 SetPageUptodate(newpage);
770 if (PageActive(page))
771 SetPageActive(newpage);
772 if (PageChecked(page))
773 SetPageChecked(newpage);
774 if (PageMappedToDisk(page))
775 SetPageMappedToDisk(newpage);
776
777 if (PageDirty(page)) {
778 clear_page_dirty_for_io(page);
779 set_page_dirty(newpage);
780 }
781
782 ClearPageSwapCache(page);
783 ClearPageActive(page);
784 ClearPagePrivate(page);
785 set_page_private(page, 0);
786 page->mapping = NULL;
787
788 /*
789 * If any waiters have accumulated on the new page then
790 * wake them up.
791 */
792 if (PageWriteback(newpage))
793 end_page_writeback(newpage);
794}
795EXPORT_SYMBOL(migrate_page_copy);
796
797/*
798 * Common logic to directly migrate a single page suitable for
799 * pages that do not use PagePrivate.
800 *
801 * Pages are locked upon entry and exit.
802 */
803int migrate_page(struct page *newpage, struct page *page)
804{
805 int rc;
806
807 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
808
809 rc = migrate_page_remove_references(newpage, page, 2);
810
811 if (rc)
812 return rc;
813
814 migrate_page_copy(newpage, page);
815
816 /*
817 * Remove auxiliary swap entries and replace
818 * them with real ptes.
819 *
820 * Note that a real pte entry will allow processes that are not
821 * waiting on the page lock to use the new page via the page tables
822 * before the new page is unlocked.
823 */
824 remove_from_swap(newpage);
825 return 0;
826}
827EXPORT_SYMBOL(migrate_page);
828
829/*
830 * migrate_pages
831 *
832 * Two lists are passed to this function. The first list
833 * contains the pages isolated from the LRU to be migrated.
834 * The second list contains new pages that the pages isolated
835 * can be moved to. If the second list is NULL then all
836 * pages are swapped out.
837 *
838 * The function returns after 10 attempts or if no pages
839 * are movable anymore because to has become empty
840 * or no retryable pages exist anymore.
841 *
842 * Return: Number of pages not migrated when "to" ran empty.
843 */
844unsigned long migrate_pages(struct list_head *from, struct list_head *to,
845 struct list_head *moved, struct list_head *failed)
846{
847 unsigned long retry;
848 unsigned long nr_failed = 0;
849 int pass = 0;
850 struct page *page;
851 struct page *page2;
852 int swapwrite = current->flags & PF_SWAPWRITE;
853 int rc;
854
855 if (!swapwrite)
856 current->flags |= PF_SWAPWRITE;
857
858redo:
859 retry = 0;
860
861 list_for_each_entry_safe(page, page2, from, lru) {
862 struct page *newpage = NULL;
863 struct address_space *mapping;
864
865 cond_resched();
866
867 rc = 0;
868 if (page_count(page) == 1)
869 /* page was freed from under us. So we are done. */
870 goto next;
871
872 if (to && list_empty(to))
873 break;
874
875 /*
876 * Skip locked pages during the first two passes to give the
877 * functions holding the lock time to release the page. Later we
878 * use lock_page() to have a higher chance of acquiring the
879 * lock.
880 */
881 rc = -EAGAIN;
882 if (pass > 2)
883 lock_page(page);
884 else
885 if (TestSetPageLocked(page))
886 goto next;
887
888 /*
889 * Only wait on writeback if we have already done a pass where
890 * we we may have triggered writeouts for lots of pages.
891 */
892 if (pass > 0) {
893 wait_on_page_writeback(page);
894 } else {
895 if (PageWriteback(page))
896 goto unlock_page;
897 }
898
899 /*
900 * Anonymous pages must have swap cache references otherwise
901 * the information contained in the page maps cannot be
902 * preserved.
903 */
904 if (PageAnon(page) && !PageSwapCache(page)) {
905 if (!add_to_swap(page, GFP_KERNEL)) {
906 rc = -ENOMEM;
907 goto unlock_page;
908 }
909 }
910
911 if (!to) {
912 rc = swap_page(page);
913 goto next;
914 }
915
916 newpage = lru_to_page(to);
917 lock_page(newpage);
918
919 /*
920 * Pages are properly locked and writeback is complete.
921 * Try to migrate the page.
922 */
923 mapping = page_mapping(page);
924 if (!mapping)
925 goto unlock_both;
926
927 if (mapping->a_ops->migratepage) {
928 /*
929 * Most pages have a mapping and most filesystems
930 * should provide a migration function. Anonymous
931 * pages are part of swap space which also has its
932 * own migration function. This is the most common
933 * path for page migration.
934 */
935 rc = mapping->a_ops->migratepage(newpage, page);
936 goto unlock_both;
937 }
938
939 /*
940 * Default handling if a filesystem does not provide
941 * a migration function. We can only migrate clean
942 * pages so try to write out any dirty pages first.
943 */
944 if (PageDirty(page)) {
945 switch (pageout(page, mapping)) {
946 case PAGE_KEEP:
947 case PAGE_ACTIVATE:
948 goto unlock_both;
949
950 case PAGE_SUCCESS:
951 unlock_page(newpage);
952 goto next;
953
954 case PAGE_CLEAN:
955 ; /* try to migrate the page below */
956 }
957 }
958
959 /*
960 * Buffers are managed in a filesystem specific way.
961 * We must have no buffers or drop them.
962 */
963 if (!page_has_buffers(page) ||
964 try_to_release_page(page, GFP_KERNEL)) {
965 rc = migrate_page(newpage, page);
966 goto unlock_both;
967 }
968
969 /*
970 * On early passes with mapped pages simply
971 * retry. There may be a lock held for some
972 * buffers that may go away. Later
973 * swap them out.
974 */
975 if (pass > 4) {
976 /*
977 * Persistently unable to drop buffers..... As a
978 * measure of last resort we fall back to
979 * swap_page().
980 */
981 unlock_page(newpage);
982 newpage = NULL;
983 rc = swap_page(page);
984 goto next;
985 }
986
987unlock_both:
988 unlock_page(newpage);
989
990unlock_page:
991 unlock_page(page);
992
993next:
994 if (rc == -EAGAIN) {
995 retry++;
996 } else if (rc) {
997 /* Permanent failure */
998 list_move(&page->lru, failed);
999 nr_failed++;
1000 } else {
1001 if (newpage) {
1002 /* Successful migration. Return page to LRU */
1003 move_to_lru(newpage);
1004 }
1005 list_move(&page->lru, moved);
1006 }
1007 }
1008 if (retry && pass++ < 10)
1009 goto redo;
1010
1011 if (!swapwrite)
1012 current->flags &= ~PF_SWAPWRITE;
1013
1014 return nr_failed + retry;
1015}
1016
1017/*
1018 * Isolate one page from the LRU lists and put it on the
1019 * indicated list with elevated refcount.
1020 *
1021 * Result:
1022 * 0 = page not on LRU list
1023 * 1 = page removed from LRU list and added to the specified list.
1024 */
1025int isolate_lru_page(struct page *page)
1026{
1027 int ret = 0;
1028
1029 if (PageLRU(page)) {
1030 struct zone *zone = page_zone(page);
1031 spin_lock_irq(&zone->lru_lock);
1032 if (PageLRU(page)) {
1033 ret = 1;
1034 get_page(page);
1035 ClearPageLRU(page);
1036 if (PageActive(page))
1037 del_page_from_active_list(zone, page);
1038 else
1039 del_page_from_inactive_list(zone, page);
1040 }
1041 spin_unlock_irq(&zone->lru_lock);
1042 }
1043
1044 return ret;
1045}
1046#endif
1047
1048/* 561/*
1049 * zone->lru_lock is heavily contended. Some of the functions that 562 * zone->lru_lock is heavily contended. Some of the functions that
1050 * shrink the lists perform better by taking out a batch of pages 563 * shrink the lists perform better by taking out a batch of pages