 include/linux/swap.h    |   7
 include/linux/swapops.h |  53
 mm/memory.c             |  18
 mm/migrate.c            | 128
 mm/mprotect.c           |  23
 mm/rmap.c               |  38
 mm/swapfile.c           |  20
 7 files changed, 255 insertions(+), 32 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index cd28ad206dae..7cee73ef4f15 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -28,7 +28,14 @@ static inline int current_is_kswapd(void)
  * the type/offset into the pte as 5/27 as well.
  */
 #define MAX_SWAPFILES_SHIFT 5
+#ifndef CONFIG_MIGRATION
 #define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT)
+#else
+/* Use last two entries for page migration swap entries */
+#define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2)
+#define SWP_MIGRATION_READ MAX_SWAPFILES
+#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1)
+#endif
 
 /*
  * Magic header for a swap area. The first part of the union is
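
Reserving the top two swap types is the trick that carries the rest of the series: a migration entry looks to every architecture like an ordinary swap pte, so no per-arch pte format changes are needed. A standalone sketch of the resulting split of the type space (plain userspace C, not kernel code):

    #include <stdio.h>

    #define MAX_SWAPFILES_SHIFT 5
    #define MAX_SWAPFILES       ((1 << MAX_SWAPFILES_SHIFT) - 2)
    #define SWP_MIGRATION_READ  MAX_SWAPFILES
    #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1)

    int main(void)
    {
        /* Types 0..29 still address swap devices; 30 and 31 now
         * mark read and write migration entries respectively. */
        printf("swap devices: types 0..%d, READ=%d, WRITE=%d\n",
               MAX_SWAPFILES - 1, SWP_MIGRATION_READ, SWP_MIGRATION_WRITE);
        return 0;
    }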
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 87b9d14c710d..ec639aa3a1d3 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -67,3 +67,56 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry)
 	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
+
+#ifdef CONFIG_MIGRATION
+static inline swp_entry_t make_migration_entry(struct page *page, int write)
+{
+	BUG_ON(!PageLocked(page));
+	return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
+			page_to_pfn(page));
+}
+
+static inline int is_migration_entry(swp_entry_t entry)
+{
+	return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
+			swp_type(entry) == SWP_MIGRATION_WRITE);
+}
+
+static inline int is_write_migration_entry(swp_entry_t entry)
+{
+	return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
+}
+
+static inline struct page *migration_entry_to_page(swp_entry_t entry)
+{
+	struct page *p = pfn_to_page(swp_offset(entry));
+	/*
+	 * Any use of migration entries may only occur while the
+	 * corresponding page is locked
+	 */
+	BUG_ON(!PageLocked(p));
+	return p;
+}
+
+static inline void make_migration_entry_read(swp_entry_t *entry)
+{
+	*entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
+}
+
+extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+					unsigned long address);
+#else
+
+#define make_migration_entry(page, write) swp_entry(0, 0)
+#define is_migration_entry(swp) 0
+#define migration_entry_to_page(swp) NULL
+static inline void make_migration_entry_read(swp_entry_t *entryp) { }
+static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+					 unsigned long address) { }
+static inline int is_write_migration_entry(swp_entry_t entry)
+{
+	return 0;
+}
+
+#endif
+
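
A migration entry is an ordinary swp_entry_t whose offset field carries a pfn rather than a swap slot. The userspace model below shows the round trip the helpers above rely on; the 5/27 type/offset split mirrors the comment in swap.h, but the real packing is arch-specific, so treat the shift as illustrative only:

    #include <assert.h>
    #include <stdio.h>

    #define SWP_TYPE_SHIFT      27  /* illustrative 5/27 split */
    #define SWP_MIGRATION_READ  30
    #define SWP_MIGRATION_WRITE 31

    typedef struct { unsigned long val; } swp_entry_t;

    static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
    {
        return (swp_entry_t){ (type << SWP_TYPE_SHIFT) | offset };
    }

    static unsigned long swp_type(swp_entry_t e)
    {
        return e.val >> SWP_TYPE_SHIFT;
    }

    static unsigned long swp_offset(swp_entry_t e)
    {
        return e.val & ((1UL << SWP_TYPE_SHIFT) - 1);
    }

    int main(void)
    {
        /* A write migration entry for pfn 0x1234: the type says
         * "under migration, was writable", the offset names the page. */
        swp_entry_t e = swp_entry(SWP_MIGRATION_WRITE, 0x1234);

        assert(swp_type(e) == SWP_MIGRATION_WRITE);
        assert(swp_offset(e) == 0x1234);

        /* Downgrade to read, as make_migration_entry_read() does. */
        e = swp_entry(SWP_MIGRATION_READ, swp_offset(e));
        assert(swp_type(e) == SWP_MIGRATION_READ);

        puts("round trip ok");
        return 0;
    }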
diff --git a/mm/memory.c b/mm/memory.c
index 7e3683fd4f3c..11673c5d2c20 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		/* pte contains position in swap or file, so copy. */
 		if (unlikely(!pte_present(pte))) {
 			if (!pte_file(pte)) {
-				swap_duplicate(pte_to_swp_entry(pte));
+				swp_entry_t entry = pte_to_swp_entry(pte);
+
+				swap_duplicate(entry);
 				/* make sure dst_mm is on swapoff's mmlist. */
 				if (unlikely(list_empty(&dst_mm->mmlist))) {
 					spin_lock(&mmlist_lock);
@@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 						 &src_mm->mmlist);
 					spin_unlock(&mmlist_lock);
 				}
+				if (is_write_migration_entry(entry) &&
+						is_cow_mapping(vm_flags)) {
+					/*
+					 * COW mappings require pages in both parent
+					 * and child to be set to read.
+					 */
+					make_migration_entry_read(&entry);
+					pte = swp_entry_to_pte(entry);
+					set_pte_at(src_mm, addr, src_pte, pte);
+				}
 			}
 			goto out_set_pte;
 		}
@@ -1879,6 +1891,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
+	if (is_migration_entry(entry)) {
+		migration_entry_wait(mm, pmd, address);
+		goto out;
+	}
 	page = lookup_swap_cache(entry);
 	if (!page) {
 		swapin_readahead(entry, address, vma);
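
Two independent changes here: do_swap_page() turns a fault on a migration entry into a wait for the migration to finish (migration_entry_wait() is defined in mm/migrate.c below), and fork() downgrades an inherited write migration entry to read when the mapping is COW, so whichever process writes first takes a fault. A compact check of that fork rule (hypothetical userspace reduction, not kernel code):

    #include <assert.h>
    #include <stdbool.h>

    enum action { COPY_AS_IS, COPY_AND_DOWNGRADE };

    /* Reduction of copy_one_pte()'s non-present branch: only a write
     * migration entry in a COW mapping needs rewriting as read-only. */
    static enum action copy_swap_pte(bool write_migration_entry, bool cow)
    {
        if (write_migration_entry && cow)
            return COPY_AND_DOWNGRADE;
        return COPY_AS_IS;
    }

    int main(void)
    {
        assert(copy_swap_pte(true,  true)  == COPY_AND_DOWNGRADE);
        assert(copy_swap_pte(true,  false) == COPY_AS_IS); /* shared mapping */
        assert(copy_swap_pte(false, true)  == COPY_AS_IS); /* already read */
        return 0;
    }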
diff --git a/mm/migrate.c b/mm/migrate.c
index 5a340f4ca212..0a011e421bb4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -15,6 +15,7 @@
 #include <linux/migrate.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/swapops.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/mm_inline.h>
@@ -23,7 +24,6 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
-#include <linux/swapops.h>
 
 #include "internal.h"
 
@@ -119,6 +119,132 @@ int putback_lru_pages(struct list_head *l)
 	return count;
 }
 
+static inline int is_swap_pte(pte_t pte)
+{
+	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+}
+
+/*
+ * Restore a potential migration pte to a working pte entry
+ */
+static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
+		struct page *old, struct page *new)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	swp_entry_t entry;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		return;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		return;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		return;
+
+	ptep = pte_offset_map(pmd, addr);
+
+	if (!is_swap_pte(*ptep)) {
+		pte_unmap(ptep);
+		return;
+	}
+
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	pte = *ptep;
+	if (!is_swap_pte(pte))
+		goto out;
+
+	entry = pte_to_swp_entry(pte);
+
+	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
+		goto out;
+
+	inc_mm_counter(mm, anon_rss);
+	get_page(new);
+	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+	if (is_write_migration_entry(entry))
+		pte = pte_mkwrite(pte);
+	set_pte_at(mm, addr, ptep, pte);
+	page_add_anon_rmap(new, vma, addr);
+out:
+	pte_unmap_unlock(ptep, ptl);
+}
+
+/*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ *
+ * Must hold mmap_sem lock on at least one of the vmas containing
+ * the page so that the anon_vma cannot vanish.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+	struct anon_vma *anon_vma;
+	struct vm_area_struct *vma;
+	unsigned long mapping;
+
+	mapping = (unsigned long)new->mapping;
+
+	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
+		return;
+
+	/*
+	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+	 */
+	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
+		remove_migration_pte(vma, page_address_in_vma(new, vma),
+					old, new);
+
+	spin_unlock(&anon_vma->lock);
+}
+
+/*
+ * Something used the pte of a page under migration. We need to
+ * get to the page and wait until migration is finished.
+ * When we return from this function the fault will be retried.
+ *
+ * This function is called from do_swap_page().
+ */
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long address)
+{
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	swp_entry_t entry;
+	struct page *page;
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	pte = *ptep;
+	if (!is_swap_pte(pte))
+		goto out;
+
+	entry = pte_to_swp_entry(pte);
+	if (!is_migration_entry(entry))
+		goto out;
+
+	page = migration_entry_to_page(entry);
+
+	get_page(page);
+	pte_unmap_unlock(ptep, ptl);
+	wait_on_page_locked(page);
+	put_page(page);
+	return;
+out:
+	pte_unmap_unlock(ptep, ptl);
+}
+
 /*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
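
The synchronization contract is that the migrating side holds the page lock for the whole window in which migration ptes exist, so migration_entry_wait() only has to take a reference and wait for the lock to drop before letting the fault retry. A toy pthread model of that handshake (userspace only; the mutex stands in for the page lock, the flag for the migration pte):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static int pte_is_migration_entry;  /* set while migration is in flight */

    static void *faulting_thread(void *arg)
    {
        (void)arg;
        /* do_swap_page(): found a migration entry, wait, then retry. */
        while (__atomic_load_n(&pte_is_migration_entry, __ATOMIC_ACQUIRE)) {
            pthread_mutex_lock(&page_lock);   /* wait_on_page_locked() */
            pthread_mutex_unlock(&page_lock);
        }
        puts("fault retried against the restored pte");
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        /* Migration side: lock the page, then install migration entries. */
        pthread_mutex_lock(&page_lock);
        __atomic_store_n(&pte_is_migration_entry, 1, __ATOMIC_RELEASE);

        pthread_create(&t, NULL, faulting_thread, NULL);

        /* ... copy page contents ... then remove_migration_ptes()
         * and finally unlock, releasing any waiters. */
        __atomic_store_n(&pte_is_migration_entry, 0, __ATOMIC_RELEASE);
        pthread_mutex_unlock(&page_lock);

        pthread_join(t, NULL);
        return 0;
    }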
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 5faf01ad3ef8..14f93e62270f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -19,7 +19,8 @@
 #include <linux/mempolicy.h>
 #include <linux/personality.h>
 #include <linux/syscalls.h>
-
+#include <linux/swap.h>
+#include <linux/swapops.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -28,12 +29,13 @@
 static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot)
 {
-	pte_t *pte;
+	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	do {
-		if (pte_present(*pte)) {
+		oldpte = *pte;
+		if (pte_present(oldpte)) {
 			pte_t ptent;
 
 			/* Avoid an SMP race with hardware updated dirty/clean
@@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
 			set_pte_at(mm, addr, pte, ptent);
 			lazy_mmu_prot_update(ptent);
+#ifdef CONFIG_MIGRATION
+		} else if (!pte_file(oldpte)) {
+			swp_entry_t entry = pte_to_swp_entry(oldpte);
+
+			if (is_write_migration_entry(entry)) {
+				/*
+				 * A protection check is difficult so
+				 * just be safe and disable write
+				 */
+				make_migration_entry_read(&entry);
+				set_pte_at(mm, addr, pte,
+					swp_entry_to_pte(entry));
+			}
+#endif
 		}
+
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(pte - 1, ptl);
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 10806b7af40c..3b8ce86daa3a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 	spin_lock(&mm->page_table_lock);
 	if (likely(!vma->anon_vma)) {
 		vma->anon_vma = anon_vma;
-		list_add(&vma->anon_vma_node, &anon_vma->head);
+		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
 		allocated = NULL;
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma)
 	struct anon_vma *anon_vma = vma->anon_vma;
 
 	if (anon_vma) {
-		list_add(&vma->anon_vma_node, &anon_vma->head);
+		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
 		validate_anon_vma(vma);
 	}
 }
@@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma)
 
 	if (anon_vma) {
 		spin_lock(&anon_vma->lock);
-		list_add(&vma->anon_vma_node, &anon_vma->head);
+		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
 		validate_anon_vma(vma);
 		spin_unlock(&anon_vma->lock);
 	}
@@ -620,17 +620,27 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	if (PageAnon(page)) {
 		swp_entry_t entry = { .val = page_private(page) };
-		/*
-		 * Store the swap location in the pte.
-		 * See handle_pte_fault() ...
-		 */
-		BUG_ON(!PageSwapCache(page));
-		swap_duplicate(entry);
-		if (list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			if (list_empty(&mm->mmlist))
-				list_add(&mm->mmlist, &init_mm.mmlist);
-			spin_unlock(&mmlist_lock);
+
+		if (PageSwapCache(page)) {
+			/*
+			 * Store the swap location in the pte.
+			 * See handle_pte_fault() ...
+			 */
+			swap_duplicate(entry);
+			if (list_empty(&mm->mmlist)) {
+				spin_lock(&mmlist_lock);
+				if (list_empty(&mm->mmlist))
+					list_add(&mm->mmlist, &init_mm.mmlist);
+				spin_unlock(&mmlist_lock);
+			}
+		} else {
+			/*
+			 * Store the pfn of the page in a special migration
+			 * pte. do_swap_page() will wait until the migration
+			 * pte is removed and then restart fault handling.
+			 */
+			BUG_ON(!migration);
+			entry = make_migration_entry(page, pte_write(pteval));
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
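
With this split, try_to_unmap_one() leaves every anonymous pte as either a swap reference (page in swap cache) or a migration entry carrying the pfn, and the BUG_ON documents that only callers on the migration path may reach the second branch. The list_add to list_add_tail changes above mean new vmas are appended, so the anon_vma list stays in the order the vmas were linked while remove_migration_ptes() walks it. A sketch of the unmap decision (userspace, hypothetical names):

    #include <assert.h>
    #include <stdbool.h>

    enum entry_kind { SWAP_ENTRY, MIGRATION_ENTRY };

    static enum entry_kind unmap_anon_pte(bool in_swap_cache, bool migration)
    {
        if (in_swap_cache)
            return SWAP_ENTRY;      /* store the swap location */
        assert(migration);          /* BUG_ON(!migration) above */
        return MIGRATION_ENTRY;     /* store the pfn instead */
    }

    int main(void)
    {
        assert(unmap_anon_pte(true, false) == SWAP_ENTRY);
        assert(unmap_anon_pte(false, true) == MIGRATION_ENTRY);
        return 0;
    }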
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 47a6812f5f8c..e3b1362372c2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t entry)
 	struct swap_info_struct * p;
 	struct page *page = NULL;
 
+	if (is_migration_entry(entry))
+		return;
+
 	p = swap_info_get(entry);
 	if (p) {
 		if (swap_entry_free(p, swp_offset(entry)) == 1) {
@@ -1400,19 +1403,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		if (!(p->flags & SWP_USED))
 			break;
 		error = -EPERM;
-		/*
-		 * Test if adding another swap device is possible. There are
-		 * two limiting factors: 1) the number of bits for the swap
-		 * type swp_entry_t definition and 2) the number of bits for
-		 * the swap type in the swap ptes as defined by the different
-		 * architectures. To honor both limitations a swap entry
-		 * with swap offset 0 and swap type ~0UL is created, encoded
-		 * to a swap pte, decoded to a swp_entry_t again and finally
-		 * the swap type part is extracted. This will mask all bits
-		 * from the initial ~0UL that can't be encoded in either the
-		 * swp_entry_t or the architecture definition of a swap pte.
-		 */
-		if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
+		if (type >= MAX_SWAPFILES) {
 			spin_unlock(&swap_lock);
 			goto out;
 		}
@@ -1702,6 +1693,9 @@ int swap_duplicate(swp_entry_t entry)
 	unsigned long offset, type;
 	int result = 0;
 
+	if (is_migration_entry(entry))
+		return 1;
+
 	type = swp_type(entry);
 	if (type >= nr_swapfiles)
 		goto bad_file;
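
Swap-slot accounting has to ignore the new entries: nothing on any swap device backs a migration entry, so swap_duplicate() claims success without touching counters and free_swap_and_cache() returns early. The old sys_swapon() probe that encoded ~0UL through a swap pte to discover the usable type range is also gone; MAX_SWAPFILES is now a hard bound, and per the swap.h comment the 5-bit type encoding already fits every architecture. The guard pattern, reduced to userspace (hypothetical names, not kernel code):

    #include <assert.h>
    #include <stdbool.h>

    #define MAX_SWAPFILES       30
    #define SWP_MIGRATION_READ  30
    #define SWP_MIGRATION_WRITE 31

    typedef struct { unsigned long type; } swp_entry_t;

    static bool is_migration_entry(swp_entry_t e)
    {
        return e.type == SWP_MIGRATION_READ ||
               e.type == SWP_MIGRATION_WRITE;
    }

    static int swap_duplicate(swp_entry_t e)
    {
        if (is_migration_entry(e))
            return 1;   /* no swap slot to pin; report success */
        /* ... real path would bump the slot's use count ... */
        return 1;
    }

    int main(void)
    {
        swp_entry_t mig = { SWP_MIGRATION_WRITE };
        assert(swap_duplicate(mig) == 1);
        return 0;
    }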