[PATCH] More page migration: use migration entries for file pages

This implements the use of migration entries to preserve ptes of file backed pages during migration. Processes can therefore be migrated back and forth without losing their connection to pagecache pages. Note that we implement the migration entries only for linear mappings. Nonlinear mappings still require the unmapping of the ptes for migration. And another writepage() ugliness shows up. writepage() can drop the page lock. Therefore we have to remove migration ptes before calling writepage() in order to avoid having migration entries point to unlocked pages. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Christoph Lameter <clameter@sgi.com> 2006-06-23 05:03:38 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-06-23 10:42:51 -0400
commit: 04e62a29bf157ce1edd168f2b71b533c80d13628 (patch)
tree: 7f0d5a58eeef2c2e08da86dc7141a1ccd050a37d /mm
parent: 442c9137de8d769053e81d325709dca72f0b5e44 (diff)
3 files changed, 124 insertions, 28 deletions
diff --git a/mm/migrate.c b/mm/migrate.c
index 96b9546e69e0..b5000d463893 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -24,6 +24,7 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/writeback.h>
 #include "internal.h"
@@ -123,7 +124,7 @@ static inline int is_swap_pte(pte_t pte)
 /*
 * Restore a potential migration pte to a working pte entry
 */
-static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
+static void remove_migration_pte(struct vm_area_struct *vma,
                struct page *old, struct page *new)
 {
        struct mm_struct *mm = vma->vm_mm;
@@ -133,6 +134,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
        pmd_t *pmd;
        pte_t *ptep, pte;
        spinlock_t *ptl;
+        unsigned long addr = page_address_in_vma(new, vma);
+        if (addr == -EFAULT)
+                return;
        pgd = pgd_offset(mm, addr);
        if (!pgd_present(*pgd))
@@ -169,19 +174,47 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
        if (is_write_migration_entry(entry))
                pte = pte_mkwrite(pte);
        set_pte_at(mm, addr, ptep, pte);
-        page_add_anon_rmap(new, vma, addr);
+        if (PageAnon(new))
+                page_add_anon_rmap(new, vma, addr);
+        else
+                page_add_file_rmap(new);
+        /* No need to invalidate - it was non-present before */
+        update_mmu_cache(vma, addr, pte);
+        lazy_mmu_prot_update(pte);
 out:
        pte_unmap_unlock(ptep, ptl);
 }
 /*
- * Get rid of all migration entries and replace them by
+ * Note that remove_file_migration_ptes will only work on regular mappings,
- * references to the indicated page.
+ * Nonlinear mappings do not use migration entries.
- *
+ */
+static void remove_file_migration_ptes(struct page *old, struct page *new)
+{
+        struct vm_area_struct *vma;
+        struct address_space *mapping = page_mapping(new);
+        struct prio_tree_iter iter;
+        pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+        if (!mapping)
+                return;
+        spin_lock(&mapping->i_mmap_lock);
+        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
+                remove_migration_pte(vma, old, new);
+        spin_unlock(&mapping->i_mmap_lock);
+}
+/*
 * Must hold mmap_sem lock on at least one of the vmas containing
 * the page so that the anon_vma cannot vanish.
 */
-static void remove_migration_ptes(struct page *old, struct page *new)
+static void remove_anon_migration_ptes(struct page *old, struct page *new)
 {
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
@@ -199,13 +232,24 @@ static void remove_migration_ptes(struct page *old, struct page *new)
        spin_lock(&anon_vma->lock);
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
-                remove_migration_pte(vma, page_address_in_vma(new, vma),
+                remove_migration_pte(vma, old, new);
-                                        old, new);
        spin_unlock(&anon_vma->lock);
 }
 /*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+        if (PageAnon(new))
+                remove_anon_migration_ptes(old, new);
+        else
+                remove_file_migration_ptes(old, new);
+}
+/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
@@ -424,30 +468,59 @@ int buffer_migrate_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(buffer_migrate_page);
-static int fallback_migrate_page(struct address_space *mapping,
+/*
-        struct page *newpage, struct page *page)
+ * Writeback a page to clean the dirty state
+ */
+static int writeout(struct address_space *mapping, struct page *page)
 {
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_NONE,
+                .nr_to_write = 1,
+                .range_start = 0,
+                .range_end = LLONG_MAX,
+                .nonblocking = 1,
+                .for_reclaim = 1
+        };
+        int rc;
+        if (!mapping->a_ops->writepage)
+                /* No write method for the address space */
+                return -EINVAL;
+        if (!clear_page_dirty_for_io(page))
+                /* Someone else already triggered a write */
+                return -EAGAIN;
        /*
-         * Default handling if a filesystem does not provide
+         * A dirty page may imply that the underlying filesystem has
-         * a migration function. We can only migrate clean
+         * the page on some queue. So the page must be clean for
-         * pages so try to write out any dirty pages first.
+         * migration. Writeout may mean we loose the lock and the
+         * page state is no longer what we checked for earlier.
+         * At this point we know that the migration attempt cannot
+         * be successful.
         */
-        if (PageDirty(page)) {
+        remove_migration_ptes(page, page);
-                switch (pageout(page, mapping)) {
-                case PAGE_KEEP:
-                case PAGE_ACTIVATE:
-                        return -EAGAIN;
-                case PAGE_SUCCESS:
+        rc = mapping->a_ops->writepage(page, &wbc);
-                        /* Relock since we lost the lock */
+        if (rc < 0)
-                        lock_page(page);
+                /* I/O Error writing */
-                        /* Must retry since page state may have changed */
+                return -EIO;
-                        return -EAGAIN;
-                case PAGE_CLEAN:
+        if (rc != AOP_WRITEPAGE_ACTIVATE)
-                        ; /* try to migrate the page below */
+                /* unlocked. Relock */
-                }
+                lock_page(page);
-        }
+        return -EAGAIN;
+}
+/*
+ * Default handling if a filesystem does not provide a migration function.
+ */
+static int fallback_migrate_page(struct address_space *mapping,
+        struct page *newpage, struct page *page)
+{
+        if (PageDirty(page))
+                return writeout(mapping, page);
        /*
         * Buffers may be managed in a filesystem specific way.
diff --git a/mm/rmap.c b/mm/rmap.c
index 05d6d73a692d..882a85826bb2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -596,6 +596,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                spin_unlock(&mmlist_lock);
                        }
                        dec_mm_counter(mm, anon_rss);
+#ifdef CONFIG_MIGRATION
                } else {
                        /*
                         * Store the pfn of the page in a special migration
@@ -604,12 +605,22 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                         */
                        BUG_ON(!migration);
                        entry = make_migration_entry(page, pte_write(pteval));
+#endif
                }
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
        } else
+#ifdef CONFIG_MIGRATION
+        if (migration) {
+                /* Establish migration entry for a file page */
+                swp_entry_t entry;
+                entry = make_migration_entry(page, pte_write(pteval));
+                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+        } else
+#endif
                dec_mm_counter(mm, file_rss);
        page_remove_rmap(page);
        page_cache_release(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc5d4f43036c..71a02e295037 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -290,11 +290,23 @@ static void handle_write_error(struct address_space *mapping,
        unlock_page(page);
 }
+/* possible outcome of pageout() */
+typedef enum {
+        /* failed to write page out, page is locked */
+        PAGE_KEEP,
+        /* move page to the active list, page is locked */
+        PAGE_ACTIVATE,
+        /* page has been sent to the disk successfully, page is unlocked */
+        PAGE_SUCCESS,
+        /* page is clean and locked */
+        PAGE_CLEAN,
+} pageout_t;
 /*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
-pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
        /*
         * If the page is dirty, only perform writeback if that write
author	Christoph Lameter <clameter@sgi.com>	2006-06-23 05:03:38 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-06-23 10:42:51 -0400
commit	04e62a29bf157ce1edd168f2b71b533c80d13628 (patch)
tree	7f0d5a58eeef2c2e08da86dc7141a1ccd050a37d /mm
parent	442c9137de8d769053e81d325709dca72f0b5e44 (diff)

diff --git a/mm/migrate.c b/mm/migrate.c index 96b9546e69e0..b5000d463893 100644 --- a/mm/migrate.c +++ b/mm/migrate.c
@@ -24,6 +24,7 @@
24	#include <linux/topology.h>	24	#include <linux/topology.h>
25	#include <linux/cpu.h>	25	#include <linux/cpu.h>
26	#include <linux/cpuset.h>	26	#include <linux/cpuset.h>
		27	#include <linux/writeback.h>
27		28
28	#include "internal.h"	29	#include "internal.h"
29		30
@@ -123,7 +124,7 @@ static inline int is_swap_pte(pte_t pte)
123	/*	124	/*
124	* Restore a potential migration pte to a working pte entry	125	* Restore a potential migration pte to a working pte entry
125	*/	126	*/
126	static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,	127	static void remove_migration_pte(struct vm_area_struct *vma,
127	struct page *old, struct page *new)	128	struct page *old, struct page *new)
128	{	129	{
129	struct mm_struct *mm = vma->vm_mm;	130	struct mm_struct *mm = vma->vm_mm;
@@ -133,6 +134,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
133	pmd_t *pmd;	134	pmd_t *pmd;
134	pte_t *ptep, pte;	135	pte_t *ptep, pte;
135	spinlock_t *ptl;	136	spinlock_t *ptl;
		137	unsigned long addr = page_address_in_vma(new, vma);
		138
		139	if (addr == -EFAULT)
		140	return;
136		141
137	pgd = pgd_offset(mm, addr);	142	pgd = pgd_offset(mm, addr);
138	if (!pgd_present(*pgd))	143	if (!pgd_present(*pgd))
@@ -169,19 +174,47 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
169	if (is_write_migration_entry(entry))	174	if (is_write_migration_entry(entry))
170	pte = pte_mkwrite(pte);	175	pte = pte_mkwrite(pte);
171	set_pte_at(mm, addr, ptep, pte);	176	set_pte_at(mm, addr, ptep, pte);
172	page_add_anon_rmap(new, vma, addr);	177
		178	if (PageAnon(new))
		179	page_add_anon_rmap(new, vma, addr);
		180	else
		181	page_add_file_rmap(new);
		182
		183	/* No need to invalidate - it was non-present before */
		184	update_mmu_cache(vma, addr, pte);
		185	lazy_mmu_prot_update(pte);
		186
173	out:	187	out:
174	pte_unmap_unlock(ptep, ptl);	188	pte_unmap_unlock(ptep, ptl);
175	}	189	}
176		190
177	/*	191	/*
178	* Get rid of all migration entries and replace them by	192	* Note that remove_file_migration_ptes will only work on regular mappings,
179	* references to the indicated page.	193	* Nonlinear mappings do not use migration entries.
180	*	194	*/
		195	static void remove_file_migration_ptes(struct page *old, struct page *new)
		196	{
		197	struct vm_area_struct *vma;
		198	struct address_space *mapping = page_mapping(new);
		199	struct prio_tree_iter iter;
		200	pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
		201
		202	if (!mapping)
		203	return;
		204
		205	spin_lock(&mapping->i_mmap_lock);
		206
		207	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
		208	remove_migration_pte(vma, old, new);
		209
		210	spin_unlock(&mapping->i_mmap_lock);
		211	}
		212
		213	/*
181	* Must hold mmap_sem lock on at least one of the vmas containing	214	* Must hold mmap_sem lock on at least one of the vmas containing
182	* the page so that the anon_vma cannot vanish.	215	* the page so that the anon_vma cannot vanish.
183	*/	216	*/
184	static void remove_migration_ptes(struct page *old, struct page *new)	217	static void remove_anon_migration_ptes(struct page *old, struct page *new)
185	{	218	{
186	struct anon_vma *anon_vma;	219	struct anon_vma *anon_vma;
187	struct vm_area_struct *vma;	220	struct vm_area_struct *vma;
@@ -199,13 +232,24 @@ static void remove_migration_ptes(struct page *old, struct page *new)
199	spin_lock(&anon_vma->lock);	232	spin_lock(&anon_vma->lock);
200		233
201	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)	234	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
202	remove_migration_pte(vma, page_address_in_vma(new, vma),	235	remove_migration_pte(vma, old, new);
203	old, new);
204		236
205	spin_unlock(&anon_vma->lock);	237	spin_unlock(&anon_vma->lock);
206	}	238	}
207		239
208	/*	240	/*
		241	* Get rid of all migration entries and replace them by
		242	* references to the indicated page.
		243	*/
		244	static void remove_migration_ptes(struct page *old, struct page *new)
		245	{
		246	if (PageAnon(new))
		247	remove_anon_migration_ptes(old, new);
		248	else
		249	remove_file_migration_ptes(old, new);
		250	}
		251
		252	/*
209	* Something used the pte of a page under migration. We need to	253	* Something used the pte of a page under migration. We need to
210	* get to the page and wait until migration is finished.	254	* get to the page and wait until migration is finished.
211	* When we return from this function the fault will be retried.	255	* When we return from this function the fault will be retried.
@@ -424,30 +468,59 @@ int buffer_migrate_page(struct address_space *mapping,
424	}	468	}
425	EXPORT_SYMBOL(buffer_migrate_page);	469	EXPORT_SYMBOL(buffer_migrate_page);
426		470
427	static int fallback_migrate_page(struct address_space *mapping,	471	/*
428	struct page *newpage, struct page *page)	472	* Writeback a page to clean the dirty state
		473	*/
		474	static int writeout(struct address_space *mapping, struct page *page)
429	{	475	{
		476	struct writeback_control wbc = {
		477	.sync_mode = WB_SYNC_NONE,
		478	.nr_to_write = 1,
		479	.range_start = 0,
		480	.range_end = LLONG_MAX,
		481	.nonblocking = 1,
		482	.for_reclaim = 1
		483	};
		484	int rc;
		485
		486	if (!mapping->a_ops->writepage)
		487	/* No write method for the address space */
		488	return -EINVAL;
		489
		490	if (!clear_page_dirty_for_io(page))
		491	/* Someone else already triggered a write */
		492	return -EAGAIN;
		493
430	/*	494	/*
431	* Default handling if a filesystem does not provide	495	* A dirty page may imply that the underlying filesystem has
432	* a migration function. We can only migrate clean	496	* the page on some queue. So the page must be clean for
433	* pages so try to write out any dirty pages first.	497	* migration. Writeout may mean we loose the lock and the
		498	* page state is no longer what we checked for earlier.
		499	* At this point we know that the migration attempt cannot
		500	* be successful.
434	*/	501	*/
435	if (PageDirty(page)) {	502	remove_migration_ptes(page, page);
436	switch (pageout(page, mapping)) {
437	case PAGE_KEEP:
438	case PAGE_ACTIVATE:
439	return -EAGAIN;
440		503
441	case PAGE_SUCCESS:	504	rc = mapping->a_ops->writepage(page, &wbc);
442	/* Relock since we lost the lock */	505	if (rc < 0)
443	lock_page(page);	506	/* I/O Error writing */
444	/* Must retry since page state may have changed */	507	return -EIO;
445	return -EAGAIN;
446		508
447	case PAGE_CLEAN:	509	if (rc != AOP_WRITEPAGE_ACTIVATE)
448	; /* try to migrate the page below */	510	/* unlocked. Relock */
449	}	511	lock_page(page);
450	}	512
		513	return -EAGAIN;
		514	}
		515
		516	/*
		517	* Default handling if a filesystem does not provide a migration function.
		518	*/
		519	static int fallback_migrate_page(struct address_space *mapping,
		520	struct page *newpage, struct page *page)
		521	{
		522	if (PageDirty(page))
		523	return writeout(mapping, page);
451		524
452	/*	525	/*
453	* Buffers may be managed in a filesystem specific way.	526	* Buffers may be managed in a filesystem specific way.


diff --git a/mm/rmap.c b/mm/rmap.c index 05d6d73a692d..882a85826bb2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c
@@ -596,6 +596,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
596	spin_unlock(&mmlist_lock);	596	spin_unlock(&mmlist_lock);
597	}	597	}
598	dec_mm_counter(mm, anon_rss);	598	dec_mm_counter(mm, anon_rss);
		599	#ifdef CONFIG_MIGRATION
599	} else {	600	} else {
600	/*	601	/*
601	* Store the pfn of the page in a special migration	602	* Store the pfn of the page in a special migration
@@ -604,12 +605,22 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
604	*/	605	*/
605	BUG_ON(!migration);	606	BUG_ON(!migration);
606	entry = make_migration_entry(page, pte_write(pteval));	607	entry = make_migration_entry(page, pte_write(pteval));
		608	#endif
607	}	609	}
608	set_pte_at(mm, address, pte, swp_entry_to_pte(entry));	610	set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
609	BUG_ON(pte_file(*pte));	611	BUG_ON(pte_file(*pte));
610	} else	612	} else
		613	#ifdef CONFIG_MIGRATION
		614	if (migration) {
		615	/* Establish migration entry for a file page */
		616	swp_entry_t entry;
		617	entry = make_migration_entry(page, pte_write(pteval));
		618	set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
		619	} else
		620	#endif
611	dec_mm_counter(mm, file_rss);	621	dec_mm_counter(mm, file_rss);
612		622
		623
613	page_remove_rmap(page);	624	page_remove_rmap(page);
614	page_cache_release(page);	625	page_cache_release(page);
615		626


diff --git a/mm/vmscan.c b/mm/vmscan.c index bc5d4f43036c..71a02e295037 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c
@@ -290,11 +290,23 @@ static void handle_write_error(struct address_space *mapping,
290	unlock_page(page);	290	unlock_page(page);
291	}	291	}
292		292
		293	/* possible outcome of pageout() */
		294	typedef enum {
		295	/* failed to write page out, page is locked */
		296	PAGE_KEEP,
		297	/* move page to the active list, page is locked */
		298	PAGE_ACTIVATE,
		299	/* page has been sent to the disk successfully, page is unlocked */
		300	PAGE_SUCCESS,
		301	/* page is clean and locked */
		302	PAGE_CLEAN,
		303	} pageout_t;
		304
293	/*	305	/*
294	* pageout is called by shrink_page_list() for each dirty page.	306	* pageout is called by shrink_page_list() for each dirty page.
295	* Calls ->writepage().	307	* Calls ->writepage().
296	*/	308	*/
297	pageout_t pageout(struct page *page, struct address_space *mapping)	309	static pageout_t pageout(struct page *page, struct address_space *mapping)
298	{	310	{
299	/*	311	/*
300	* If the page is dirty, only perform writeback if that write	312	* If the page is dirty, only perform writeback if that write