author     Christoph Lameter <clameter@sgi.com>      2006-06-23 05:03:38 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>     2006-06-23 10:42:51 -0400
commit     04e62a29bf157ce1edd168f2b71b533c80d13628 (patch)
tree       7f0d5a58eeef2c2e08da86dc7141a1ccd050a37d
parent     442c9137de8d769053e81d325709dca72f0b5e44 (diff)
[PATCH] More page migration: use migration entries for file pages

This implements the use of migration entries to preserve ptes of file-backed
pages during migration. Processes can therefore be migrated back and forth
without losing their connection to pagecache pages.

Note that we implement the migration entries only for linear mappings.
Nonlinear mappings still require the unmapping of the ptes for migration.

And another writepage() ugliness shows up. writepage() can drop the page
lock. Therefore we have to remove migration ptes before calling writepage()
in order to avoid having migration entries point to unlocked pages.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
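
The idea in the message above can be pictured with a small standalone model: while a page is
being copied, every pte that maps it is replaced by a special non-present "migration entry"
that remembers the old page and its write permission; once the new copy is in place, each
entry is rewritten into a working pte for the new page, so the process never loses its
mapping. The sketch below is a hypothetical userspace illustration only; the struct and
helper names are stand-ins and do not reflect the kernel's real pte or swp_entry_t encoding.

/*
 * Hypothetical userspace model of the mechanism (not kernel code).
 * A "pte" either maps a page or holds a migration entry that records
 * which page is being moved and whether the mapping was writable, so
 * the mapping can be re-established on the new page afterwards.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct page { unsigned long pfn; char data[64]; };

struct pte {
        bool present;           /* normal, working mapping */
        bool migration;         /* special non-present migration entry */
        bool writable;          /* write permission, preserved across migration */
        struct page *page;      /* mapped page (the old page while migrating) */
};

/* Unmap step: swap the working pte for a migration entry
 * (what try_to_unmap_one() does in the rmap.c hunks below). */
static void set_migration_entry(struct pte *pte)
{
        assert(pte->present);
        pte->present = false;
        pte->migration = true;  /* pte->page still identifies the old page */
}

/* Remap step: turn the migration entry back into a working pte for the
 * new page (what remove_migration_pte() does in the migrate.c hunks). */
static void restore_migration_entry(struct pte *pte, struct page *old,
                                    struct page *new)
{
        if (!pte->migration || pte->page != old)
                return;
        pte->migration = false;
        pte->present = true;
        pte->page = new;        /* the preserved writable flag still applies */
}

int main(void)
{
        struct page old = { .pfn = 1, .data = "pagecache contents" };
        struct page new = { .pfn = 2, .data = "" };
        struct pte pte = { .present = true, .writable = true, .page = &old };

        set_migration_entry(&pte);                              /* faults would now wait */
        snprintf(new.data, sizeof(new.data), "%s", old.data);   /* copy the page contents */
        restore_migration_entry(&pte, &old, &new);              /* reconnect to the new page */

        printf("pte -> pfn %lu, writable=%d, data=\"%s\"\n",
               pte.page->pfn, pte.writable, pte.page->data);
        return 0;
}

In the patch itself this is done per vma: remove_anon_migration_ptes() walks the anon_vma
list for anonymous pages, and the new remove_file_migration_ptes() walks the mapping's
i_mmap prio tree for file-backed pages, as the mm/migrate.c hunks below show.
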
-rw-r--r--  include/linux/swap.h |  15
-rw-r--r--  mm/migrate.c         | 127
-rw-r--r--  mm/rmap.c            |  11
-rw-r--r--  mm/vmscan.c          |  14
4 files changed, 124 insertions, 43 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7cee73ef4f15..1cf234e8df55 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -186,20 +186,6 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 
-/* possible outcome of pageout() */
-typedef enum {
-        /* failed to write page out, page is locked */
-        PAGE_KEEP,
-        /* move page to the active list, page is locked */
-        PAGE_ACTIVATE,
-        /* page has been sent to the disk successfully, page is unlocked */
-        PAGE_SUCCESS,
-        /* page is clean and locked */
-        PAGE_CLEAN,
-} pageout_t;
-
-extern pageout_t pageout(struct page *page, struct address_space *mapping);
-
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
 extern int zone_reclaim_interval;
@@ -259,7 +245,6 @@ extern int remove_exclusive_swap_page(struct page *);
 struct backing_dev_info;
 
 extern spinlock_t swap_lock;
-extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page);
 
 /* linux/mm/thrash.c */
 extern struct mm_struct * swap_token_mm;
diff --git a/mm/migrate.c b/mm/migrate.c
index 96b9546e69e0..b5000d463893 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -24,6 +24,7 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/writeback.h>
 
 #include "internal.h"
 
@@ -123,7 +124,7 @@ static inline int is_swap_pte(pte_t pte)
 /*
  * Restore a potential migration pte to a working pte entry
  */
-static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
+static void remove_migration_pte(struct vm_area_struct *vma,
                 struct page *old, struct page *new)
 {
         struct mm_struct *mm = vma->vm_mm;
@@ -133,6 +134,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
         pmd_t *pmd;
         pte_t *ptep, pte;
         spinlock_t *ptl;
+        unsigned long addr = page_address_in_vma(new, vma);
+
+        if (addr == -EFAULT)
+                return;
 
         pgd = pgd_offset(mm, addr);
         if (!pgd_present(*pgd))
@@ -169,19 +174,47 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
         if (is_write_migration_entry(entry))
                 pte = pte_mkwrite(pte);
         set_pte_at(mm, addr, ptep, pte);
-        page_add_anon_rmap(new, vma, addr);
+
+        if (PageAnon(new))
+                page_add_anon_rmap(new, vma, addr);
+        else
+                page_add_file_rmap(new);
+
+        /* No need to invalidate - it was non-present before */
+        update_mmu_cache(vma, addr, pte);
+        lazy_mmu_prot_update(pte);
+
 out:
         pte_unmap_unlock(ptep, ptl);
 }
 
 /*
- * Get rid of all migration entries and replace them by
- * references to the indicated page.
- *
+ * Note that remove_file_migration_ptes will only work on regular mappings,
+ * Nonlinear mappings do not use migration entries.
+ */
+static void remove_file_migration_ptes(struct page *old, struct page *new)
+{
+        struct vm_area_struct *vma;
+        struct address_space *mapping = page_mapping(new);
+        struct prio_tree_iter iter;
+        pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+        if (!mapping)
+                return;
+
+        spin_lock(&mapping->i_mmap_lock);
+
+        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
+                remove_migration_pte(vma, old, new);
+
+        spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
  * Must hold mmap_sem lock on at least one of the vmas containing
  * the page so that the anon_vma cannot vanish.
  */
-static void remove_migration_ptes(struct page *old, struct page *new)
+static void remove_anon_migration_ptes(struct page *old, struct page *new)
 {
         struct anon_vma *anon_vma;
         struct vm_area_struct *vma;
@@ -199,13 +232,24 @@ static void remove_migration_ptes(struct page *old, struct page *new)
         spin_lock(&anon_vma->lock);
 
         list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
-                remove_migration_pte(vma, page_address_in_vma(new, vma),
-                        old, new);
+                remove_migration_pte(vma, old, new);
 
         spin_unlock(&anon_vma->lock);
 }
 
 /*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+        if (PageAnon(new))
+                remove_anon_migration_ptes(old, new);
+        else
+                remove_file_migration_ptes(old, new);
+}
+
+/*
  * Something used the pte of a page under migration. We need to
  * get to the page and wait until migration is finished.
  * When we return from this function the fault will be retried.
@@ -424,30 +468,59 @@ int buffer_migrate_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(buffer_migrate_page);
 
-static int fallback_migrate_page(struct address_space *mapping,
-        struct page *newpage, struct page *page)
+/*
+ * Writeback a page to clean the dirty state
+ */
+static int writeout(struct address_space *mapping, struct page *page)
 {
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_NONE,
+                .nr_to_write = 1,
+                .range_start = 0,
+                .range_end = LLONG_MAX,
+                .nonblocking = 1,
+                .for_reclaim = 1
+        };
+        int rc;
+
+        if (!mapping->a_ops->writepage)
+                /* No write method for the address space */
+                return -EINVAL;
+
+        if (!clear_page_dirty_for_io(page))
+                /* Someone else already triggered a write */
+                return -EAGAIN;
+
         /*
-         * Default handling if a filesystem does not provide
-         * a migration function. We can only migrate clean
-         * pages so try to write out any dirty pages first.
+         * A dirty page may imply that the underlying filesystem has
+         * the page on some queue. So the page must be clean for
+         * migration. Writeout may mean we loose the lock and the
+         * page state is no longer what we checked for earlier.
+         * At this point we know that the migration attempt cannot
+         * be successful.
         */
-        if (PageDirty(page)) {
-                switch (pageout(page, mapping)) {
-                case PAGE_KEEP:
-                case PAGE_ACTIVATE:
-                        return -EAGAIN;
+        remove_migration_ptes(page, page);
 
-                case PAGE_SUCCESS:
-                        /* Relock since we lost the lock */
-                        lock_page(page);
-                        /* Must retry since page state may have changed */
-                        return -EAGAIN;
+        rc = mapping->a_ops->writepage(page, &wbc);
+        if (rc < 0)
+                /* I/O Error writing */
+                return -EIO;
 
-                case PAGE_CLEAN:
-                        ; /* try to migrate the page below */
-                }
-        }
+        if (rc != AOP_WRITEPAGE_ACTIVATE)
+                /* unlocked. Relock */
+                lock_page(page);
+
+        return -EAGAIN;
+}
+
+/*
+ * Default handling if a filesystem does not provide a migration function.
+ */
+static int fallback_migrate_page(struct address_space *mapping,
+        struct page *newpage, struct page *page)
+{
+        if (PageDirty(page))
+                return writeout(mapping, page);
 
         /*
          * Buffers may be managed in a filesystem specific way.
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -596,6 +596,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                 spin_unlock(&mmlist_lock);
                         }
                         dec_mm_counter(mm, anon_rss);
+#ifdef CONFIG_MIGRATION
                 } else {
                         /*
                          * Store the pfn of the page in a special migration
@@ -604,12 +605,22 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                          */
                         BUG_ON(!migration);
                         entry = make_migration_entry(page, pte_write(pteval));
+#endif
                 }
                 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                 BUG_ON(pte_file(*pte));
         } else
+#ifdef CONFIG_MIGRATION
+        if (migration) {
+                /* Establish migration entry for a file page */
+                swp_entry_t entry;
+                entry = make_migration_entry(page, pte_write(pteval));
+                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+        } else
+#endif
                 dec_mm_counter(mm, file_rss);
 
+
         page_remove_rmap(page);
         page_cache_release(page);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc5d4f43036c..71a02e295037 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -290,11 +290,23 @@ static void handle_write_error(struct address_space *mapping,
         unlock_page(page);
 }
 
+/* possible outcome of pageout() */
+typedef enum {
+        /* failed to write page out, page is locked */
+        PAGE_KEEP,
+        /* move page to the active list, page is locked */
+        PAGE_ACTIVATE,
+        /* page has been sent to the disk successfully, page is unlocked */
+        PAGE_SUCCESS,
+        /* page is clean and locked */
+        PAGE_CLEAN,
+} pageout_t;
+
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
         /*
          * If the page is dirty, only perform writeback if that write