author    Christoph Lameter <clameter@sgi.com>    2006-06-23 05:03:38 -0400
committer Linus Torvalds <torvalds@g5.osdl.org>   2006-06-23 10:42:51 -0400
commit    04e62a29bf157ce1edd168f2b71b533c80d13628 (patch)
tree      7f0d5a58eeef2c2e08da86dc7141a1ccd050a37d
parent    442c9137de8d769053e81d325709dca72f0b5e44 (diff)
[PATCH] More page migration: use migration entries for file pages
This implements the use of migration entries to preserve ptes of file-backed pages during migration. Processes can therefore be migrated back and forth without losing their connection to pagecache pages.

Note that we implement the migration entries only for linear mappings. Nonlinear mappings still require the unmapping of the ptes for migration.

And another writepage() ugliness shows up: writepage() can drop the page lock. Therefore we have to remove migration ptes before calling writepage() in order to avoid having migration entries point to unlocked pages.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--  include/linux/swap.h |  15
-rw-r--r--  mm/migrate.c         | 127
-rw-r--r--  mm/rmap.c            |  11
-rw-r--r--  mm/vmscan.c          |  14
4 files changed, 124 insertions(+), 43 deletions(-)
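
For context, the core trick throughout this patch is that while a page is under migration its ptes are replaced by non-present "migration entries" that remember which page was mapped and whether the mapping was writable; remove_migration_pte() later turns them back into working ptes against the new page. Below is a minimal toy sketch of that encode/decode idea in plain C. The names, bit layout, and helpers are illustrative assumptions only, not the kernel's actual pte/swp_entry_t encoding.

/*
 * Toy model of the migration-entry idea (illustrative only; NOT the
 * kernel's encoding). A non-present entry records which page frame is
 * being migrated and whether the mapping was writable, so a real pte
 * can be rebuilt once migration completes.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_MIGRATION_BIT 0x1ULL   /* marks the value as a migration entry */
#define TOY_WRITE_BIT     0x2ULL   /* original mapping was writable */

static uint64_t toy_make_migration_entry(uint64_t pfn, bool writable)
{
        return (pfn << 2) | (writable ? TOY_WRITE_BIT : 0) | TOY_MIGRATION_BIT;
}

static bool toy_is_migration_entry(uint64_t e) { return e & TOY_MIGRATION_BIT; }
static bool toy_is_write_entry(uint64_t e)     { return e & TOY_WRITE_BIT; }
static uint64_t toy_entry_to_pfn(uint64_t e)   { return e >> 2; }

int main(void)
{
        /* Unmap step: park a migration entry for pfn 0x1234 in the slot. */
        uint64_t slot = toy_make_migration_entry(0x1234, true);

        /* Restore step: a remove_migration_pte()-style helper would
         * rebuild a present pte for the *new* page from this data. */
        if (toy_is_migration_entry(slot))
                printf("pfn=%#llx writable=%d\n",
                       (unsigned long long)toy_entry_to_pfn(slot),
                       toy_is_write_entry(slot));
        return 0;
}

The patch extends this scheme, previously used only for anonymous pages, to file-backed pages mapped linearly, which is why the diffs below split the pte-restore paths into anon and file variants.
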
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7cee73ef4f15..1cf234e8df55 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -186,20 +186,6 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 
-/* possible outcome of pageout() */
-typedef enum {
-        /* failed to write page out, page is locked */
-        PAGE_KEEP,
-        /* move page to the active list, page is locked */
-        PAGE_ACTIVATE,
-        /* page has been sent to the disk successfully, page is unlocked */
-        PAGE_SUCCESS,
-        /* page is clean and locked */
-        PAGE_CLEAN,
-} pageout_t;
-
-extern pageout_t pageout(struct page *page, struct address_space *mapping);
-
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
 extern int zone_reclaim_interval;
@@ -259,7 +245,6 @@ extern int remove_exclusive_swap_page(struct page *);
 struct backing_dev_info;
 
 extern spinlock_t swap_lock;
-extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page);
 
 /* linux/mm/thrash.c */
 extern struct mm_struct * swap_token_mm;
diff --git a/mm/migrate.c b/mm/migrate.c
index 96b9546e69e0..b5000d463893 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -24,6 +24,7 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/writeback.h>
 
 #include "internal.h"
 
@@ -123,7 +124,7 @@ static inline int is_swap_pte(pte_t pte)
 /*
  * Restore a potential migration pte to a working pte entry
  */
-static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
+static void remove_migration_pte(struct vm_area_struct *vma,
                 struct page *old, struct page *new)
 {
         struct mm_struct *mm = vma->vm_mm;
@@ -133,6 +134,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
         pmd_t *pmd;
         pte_t *ptep, pte;
         spinlock_t *ptl;
+        unsigned long addr = page_address_in_vma(new, vma);
+
+        if (addr == -EFAULT)
+                return;
 
         pgd = pgd_offset(mm, addr);
         if (!pgd_present(*pgd))
@@ -169,19 +174,47 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
         if (is_write_migration_entry(entry))
                 pte = pte_mkwrite(pte);
         set_pte_at(mm, addr, ptep, pte);
-        page_add_anon_rmap(new, vma, addr);
+
+        if (PageAnon(new))
+                page_add_anon_rmap(new, vma, addr);
+        else
+                page_add_file_rmap(new);
+
+        /* No need to invalidate - it was non-present before */
+        update_mmu_cache(vma, addr, pte);
+        lazy_mmu_prot_update(pte);
+
 out:
         pte_unmap_unlock(ptep, ptl);
 }
 
 /*
- * Get rid of all migration entries and replace them by
- * references to the indicated page.
- *
+ * Note that remove_file_migration_ptes will only work on regular mappings,
+ * Nonlinear mappings do not use migration entries.
+ */
+static void remove_file_migration_ptes(struct page *old, struct page *new)
+{
+        struct vm_area_struct *vma;
+        struct address_space *mapping = page_mapping(new);
+        struct prio_tree_iter iter;
+        pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+        if (!mapping)
+                return;
+
+        spin_lock(&mapping->i_mmap_lock);
+
+        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
+                remove_migration_pte(vma, old, new);
+
+        spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
  * Must hold mmap_sem lock on at least one of the vmas containing
  * the page so that the anon_vma cannot vanish.
  */
-static void remove_migration_ptes(struct page *old, struct page *new)
+static void remove_anon_migration_ptes(struct page *old, struct page *new)
 {
         struct anon_vma *anon_vma;
         struct vm_area_struct *vma;
@@ -199,13 +232,24 @@ static void remove_migration_ptes(struct page *old, struct page *new)
         spin_lock(&anon_vma->lock);
 
         list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
-                remove_migration_pte(vma, page_address_in_vma(new, vma),
-                        old, new);
+                remove_migration_pte(vma, old, new);
 
         spin_unlock(&anon_vma->lock);
 }
 
 /*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+        if (PageAnon(new))
+                remove_anon_migration_ptes(old, new);
+        else
+                remove_file_migration_ptes(old, new);
+}
+
+/*
  * Something used the pte of a page under migration. We need to
  * get to the page and wait until migration is finished.
  * When we return from this function the fault will be retried.
@@ -424,30 +468,59 @@ int buffer_migrate_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(buffer_migrate_page);
 
-static int fallback_migrate_page(struct address_space *mapping,
-        struct page *newpage, struct page *page)
+/*
+ * Writeback a page to clean the dirty state
+ */
+static int writeout(struct address_space *mapping, struct page *page)
 {
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_NONE,
+                .nr_to_write = 1,
+                .range_start = 0,
+                .range_end = LLONG_MAX,
+                .nonblocking = 1,
+                .for_reclaim = 1
+        };
+        int rc;
+
+        if (!mapping->a_ops->writepage)
+                /* No write method for the address space */
+                return -EINVAL;
+
+        if (!clear_page_dirty_for_io(page))
+                /* Someone else already triggered a write */
+                return -EAGAIN;
+
         /*
-         * Default handling if a filesystem does not provide
-         * a migration function. We can only migrate clean
-         * pages so try to write out any dirty pages first.
+         * A dirty page may imply that the underlying filesystem has
+         * the page on some queue. So the page must be clean for
+         * migration. Writeout may mean we loose the lock and the
+         * page state is no longer what we checked for earlier.
+         * At this point we know that the migration attempt cannot
+         * be successful.
          */
-        if (PageDirty(page)) {
-                switch (pageout(page, mapping)) {
-                case PAGE_KEEP:
-                case PAGE_ACTIVATE:
-                        return -EAGAIN;
+        remove_migration_ptes(page, page);
 
-                case PAGE_SUCCESS:
-                        /* Relock since we lost the lock */
-                        lock_page(page);
-                        /* Must retry since page state may have changed */
-                        return -EAGAIN;
+        rc = mapping->a_ops->writepage(page, &wbc);
+        if (rc < 0)
+                /* I/O Error writing */
+                return -EIO;
 
-                case PAGE_CLEAN:
-                        ; /* try to migrate the page below */
-                }
-        }
+        if (rc != AOP_WRITEPAGE_ACTIVATE)
+                /* unlocked. Relock */
+                lock_page(page);
+
+        return -EAGAIN;
+}
+
+/*
+ * Default handling if a filesystem does not provide a migration function.
+ */
+static int fallback_migrate_page(struct address_space *mapping,
+        struct page *newpage, struct page *page)
+{
+        if (PageDirty(page))
+                return writeout(mapping, page);
 
         /*
          * Buffers may be managed in a filesystem specific way.
diff --git a/mm/rmap.c b/mm/rmap.c
index 05d6d73a692d..882a85826bb2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -596,6 +596,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                         spin_unlock(&mmlist_lock);
                 }
                 dec_mm_counter(mm, anon_rss);
+#ifdef CONFIG_MIGRATION
         } else {
                 /*
                  * Store the pfn of the page in a special migration
@@ -604,12 +605,22 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                  */
                 BUG_ON(!migration);
                 entry = make_migration_entry(page, pte_write(pteval));
+#endif
                 }
                 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                 BUG_ON(pte_file(*pte));
         } else
+#ifdef CONFIG_MIGRATION
+        if (migration) {
+                /* Establish migration entry for a file page */
+                swp_entry_t entry;
+                entry = make_migration_entry(page, pte_write(pteval));
+                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+        } else
+#endif
                 dec_mm_counter(mm, file_rss);
 
+
         page_remove_rmap(page);
         page_cache_release(page);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc5d4f43036c..71a02e295037 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -290,11 +290,23 @@ static void handle_write_error(struct address_space *mapping,
         unlock_page(page);
 }
 
+/* possible outcome of pageout() */
+typedef enum {
+        /* failed to write page out, page is locked */
+        PAGE_KEEP,
+        /* move page to the active list, page is locked */
+        PAGE_ACTIVATE,
+        /* page has been sent to the disk successfully, page is unlocked */
+        PAGE_SUCCESS,
+        /* page is clean and locked */
+        PAGE_CLEAN,
+} pageout_t;
+
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
         /*
          * If the page is dirty, only perform writeback if that write