path: root/mm/memory.c
Diffstat (limited to 'mm/memory.c')
-rw-r--r--   mm/memory.c   213
1 file changed, 137 insertions(+), 76 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 44ea41196c13..987389a809e7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -45,6 +45,7 @@
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/delayacct.h>
@@ -56,6 +57,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include <asm/io.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
@@ -106,6 +108,18 @@ static int __init disable_randmaps(char *s)
 }
 __setup("norandmaps", disable_randmaps);
 
+unsigned long zero_pfn __read_mostly;
+unsigned long highest_memmap_pfn __read_mostly;
+
+/*
+ * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
+ */
+static int __init init_zero_pfn(void)
+{
+        zero_pfn = page_to_pfn(ZERO_PAGE(0));
+        return 0;
+}
+core_initcall(init_zero_pfn);
 
 /*
  * If a p?d_bad entry is found while walking page tables, report
@@ -442,6 +456,20 @@ static inline int is_cow_mapping(unsigned int flags)
         return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
 
+#ifndef is_zero_pfn
+static inline int is_zero_pfn(unsigned long pfn)
+{
+        return pfn == zero_pfn;
+}
+#endif
+
+#ifndef my_zero_pfn
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+        return zero_pfn;
+}
+#endif
+
 /*
  * vm_normal_page -- This function gets the "struct page" associated with a pte.
  *
@@ -497,7 +525,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
         if (HAVE_PTE_SPECIAL) {
                 if (likely(!pte_special(pte)))
                         goto check_pfn;
-                if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+                if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+                        return NULL;
+                if (!is_zero_pfn(pfn))
                         print_bad_pte(vma, addr, pte, NULL);
                 return NULL;
         }
@@ -519,6 +549,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                 }
         }
 
+        if (is_zero_pfn(pfn))
+                return NULL;
 check_pfn:
         if (unlikely(pfn > highest_memmap_pfn)) {
                 print_bad_pte(vma, addr, pte, NULL);
@@ -596,8 +628,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         page = vm_normal_page(vma, addr, pte);
         if (page) {
                 get_page(page);
-                page_dup_rmap(page, vma, addr);
-                rss[!!PageAnon(page)]++;
+                page_dup_rmap(page);
+                rss[PageAnon(page)]++;
         }
 
 out_set_pte:
@@ -1142,9 +1174,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                 goto no_page;
         if ((flags & FOLL_WRITE) && !pte_write(pte))
                 goto unlock;
+
         page = vm_normal_page(vma, address, pte);
-        if (unlikely(!page))
-                goto bad_page;
+        if (unlikely(!page)) {
+                if ((flags & FOLL_DUMP) ||
+                    !is_zero_pfn(pte_pfn(pte)))
+                        goto bad_page;
+                page = pte_page(pte);
+        }
 
         if (flags & FOLL_GET)
                 get_page(page);
@@ -1172,65 +1209,46 @@ no_page:
         pte_unmap_unlock(ptep, ptl);
         if (!pte_none(pte))
                 return page;
-        /* Fall through to ZERO_PAGE handling */
+
 no_page_table:
         /*
          * When core dumping an enormous anonymous area that nobody
-         * has touched so far, we don't want to allocate page tables.
+         * has touched so far, we don't want to allocate unnecessary pages or
+         * page tables.  Return error instead of NULL to skip handle_mm_fault,
+         * then get_dump_page() will return NULL to leave a hole in the dump.
+         * But we can only make this optimization where a hole would surely
+         * be zero-filled if handle_mm_fault() actually did handle it.
          */
-        if (flags & FOLL_ANON) {
-                page = ZERO_PAGE(0);
-                if (flags & FOLL_GET)
-                        get_page(page);
-                BUG_ON(flags & FOLL_WRITE);
-        }
+        if ((flags & FOLL_DUMP) &&
+            (!vma->vm_ops || !vma->vm_ops->fault))
+                return ERR_PTR(-EFAULT);
         return page;
 }
 
-/* Can we do the FOLL_ANON optimization? */
-static inline int use_zero_page(struct vm_area_struct *vma)
-{
-        /*
-         * We don't want to optimize FOLL_ANON for make_pages_present()
-         * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
-         * we want to get the page from the page tables to make sure
-         * that we serialize and update with any other user of that
-         * mapping.
-         */
-        if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
-                return 0;
-        /*
-         * And if we have a fault routine, it's not an anonymous region.
-         */
-        return !vma->vm_ops || !vma->vm_ops->fault;
-}
-
-
-
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-                     unsigned long start, int nr_pages, int flags,
+                     unsigned long start, int nr_pages, unsigned int gup_flags,
                      struct page **pages, struct vm_area_struct **vmas)
 {
         int i;
-        unsigned int vm_flags = 0;
-        int write = !!(flags & GUP_FLAGS_WRITE);
-        int force = !!(flags & GUP_FLAGS_FORCE);
-        int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
-        int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
+        unsigned long vm_flags;
 
         if (nr_pages <= 0)
                 return 0;
+
+        VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+
         /*
          * Require read or write permissions.
-         * If 'force' is set, we only require the "MAY" flags.
+         * If FOLL_FORCE is set, we only require the "MAY" flags.
          */
-        vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-        vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+        vm_flags  = (gup_flags & FOLL_WRITE) ?
+                        (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+        vm_flags &= (gup_flags & FOLL_FORCE) ?
+                        (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
         i = 0;
 
         do {
                 struct vm_area_struct *vma;
-                unsigned int foll_flags;
 
                 vma = find_extend_vma(mm, start);
                 if (!vma && in_gate_area(tsk, start)) {
@@ -1242,7 +1260,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         pte_t *pte;
 
                         /* user gate pages are read-only */
-                        if (!ignore && write)
+                        if (gup_flags & FOLL_WRITE)
                                 return i ? : -EFAULT;
                         if (pg > TASK_SIZE)
                                 pgd = pgd_offset_k(pg);
@@ -1276,38 +1294,26 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
                 if (!vma ||
                     (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
-                    (!ignore && !(vm_flags & vma->vm_flags)))
+                    !(vm_flags & vma->vm_flags))
                         return i ? : -EFAULT;
 
                 if (is_vm_hugetlb_page(vma)) {
                         i = follow_hugetlb_page(mm, vma, pages, vmas,
-                                        &start, &nr_pages, i, write);
+                                        &start, &nr_pages, i, gup_flags);
                         continue;
                 }
 
-                foll_flags = FOLL_TOUCH;
-                if (pages)
-                        foll_flags |= FOLL_GET;
-                if (!write && use_zero_page(vma))
-                        foll_flags |= FOLL_ANON;
-
                 do {
                         struct page *page;
+                        unsigned int foll_flags = gup_flags;
 
                         /*
                          * If we have a pending SIGKILL, don't keep faulting
-                         * pages and potentially allocating memory, unless
-                         * current is handling munlock--e.g., on exit. In
-                         * that case, we are not allocating memory. Rather,
-                         * we're only unlocking already resident/mapped pages.
+                         * pages and potentially allocating memory.
                          */
-                        if (unlikely(!ignore_sigkill &&
-                                        fatal_signal_pending(current)))
+                        if (unlikely(fatal_signal_pending(current)))
                                 return i ? i : -ERESTARTSYS;
 
-                        if (write)
-                                foll_flags |= FOLL_WRITE;
-
                         cond_resched();
                         while (!(page = follow_page(vma, start, foll_flags))) {
                                 int ret;
@@ -1419,18 +1425,47 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                 unsigned long start, int nr_pages, int write, int force,
                 struct page **pages, struct vm_area_struct **vmas)
 {
-        int flags = 0;
+        int flags = FOLL_TOUCH;
 
+        if (pages)
+                flags |= FOLL_GET;
         if (write)
-                flags |= GUP_FLAGS_WRITE;
+                flags |= FOLL_WRITE;
         if (force)
-                flags |= GUP_FLAGS_FORCE;
+                flags |= FOLL_FORCE;
 
         return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
 }
-
 EXPORT_SYMBOL(get_user_pages);
 
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by page_cache_release() or put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_sem, but after all other threads have been killed.
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+        struct vm_area_struct *vma;
+        struct page *page;
+
+        if (__get_user_pages(current, current->mm, addr, 1,
+                        FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+                return NULL;
+        flush_cache_page(vma, addr, page_to_pfn(page));
+        return page;
+}
+#endif /* CONFIG_ELF_CORE */
+
 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
                         spinlock_t **ptl)
 {
@@ -1608,7 +1643,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
          * If we don't have pte special, then we have to use the pfn_valid()
          * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
          * refcount the page if pfn_valid is true (hence insert_page rather
-         * than insert_pfn).
+         * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
+         * without pte special, it would there be refcounted as a normal page.
          */
         if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
                 struct page *page;
@@ -1974,7 +2010,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
          * Take out anonymous pages first, anonymous shared vmas are
          * not dirty accountable.
          */
-        if (PageAnon(old_page)) {
+        if (PageAnon(old_page) && !PageKsm(old_page)) {
                 if (!trylock_page(old_page)) {
                         page_cache_get(old_page);
                         pte_unmap_unlock(page_table, ptl);
@@ -2075,10 +2111,19 @@ gotten:
 
         if (unlikely(anon_vma_prepare(vma)))
                 goto oom;
-        VM_BUG_ON(old_page == ZERO_PAGE(0));
-        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-        if (!new_page)
-                goto oom;
+
+        if (is_zero_pfn(pte_pfn(orig_pte))) {
+                new_page = alloc_zeroed_user_highpage_movable(vma, address);
+                if (!new_page)
+                        goto oom;
+        } else {
+                new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+                if (!new_page)
+                        goto oom;
+                cow_user_page(new_page, old_page, address, vma);
+        }
+        __SetPageUptodate(new_page);
+
         /*
          * Don't let another task, with possibly unlocked vma,
          * keep the mlocked page.
@@ -2088,8 +2133,6 @@ gotten:
                         clear_page_mlock(old_page);
                 unlock_page(old_page);
         }
-        cow_user_page(new_page, old_page, address, vma);
-        __SetPageUptodate(new_page);
 
         if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                 goto oom_free_new;
@@ -2115,9 +2158,14 @@ gotten:
          * seen in the presence of one thread doing SMC and another
          * thread doing COW.
          */
-        ptep_clear_flush_notify(vma, address, page_table);
+        ptep_clear_flush(vma, address, page_table);
         page_add_new_anon_rmap(new_page, vma, address);
-        set_pte_at(mm, address, page_table, entry);
+        /*
+         * We call the notify macro here because, when using secondary
+         * mmu page tables (such as kvm shadow page tables), we want the
+         * new page to be mapped directly into the secondary page table.
+         */
+        set_pte_at_notify(mm, address, page_table, entry);
         update_mmu_cache(vma, address, entry);
         if (old_page) {
                 /*
@@ -2636,6 +2684,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         spinlock_t *ptl;
         pte_t entry;
 
+        if (!(flags & FAULT_FLAG_WRITE)) {
+                entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
+                                                vma->vm_page_prot));
+                ptl = pte_lockptr(mm, pmd);
+                spin_lock(ptl);
+                if (!pte_none(*page_table))
+                        goto unlock;
+                goto setpte;
+        }
+
         /* Allocate our own private page. */
         pte_unmap(page_table);
 
@@ -2650,13 +2708,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto oom_free_page;
 
         entry = mk_pte(page, vma->vm_page_prot);
-        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+        if (vma->vm_flags & VM_WRITE)
+                entry = pte_mkwrite(pte_mkdirty(entry));
 
         page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
         if (!pte_none(*page_table))
                 goto release;
+
         inc_mm_counter(mm, anon_rss);
         page_add_new_anon_rmap(page, vma, address);
+setpte:
         set_pte_at(mm, address, page_table, entry);
 
         /* No need to invalidate - it was non-present before */