Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	212
1 files changed, 136 insertions, 76 deletions
diff --git a/mm/memory.c b/mm/memory.c
index e8f63d9961ea..b1443ac07c00 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -45,6 +45,7 @@
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/delayacct.h>
@@ -107,6 +108,18 @@ static int __init disable_randmaps(char *s)
 }
 __setup("norandmaps", disable_randmaps);
 
+unsigned long zero_pfn __read_mostly;
+unsigned long highest_memmap_pfn __read_mostly;
+
+/*
+ * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
+ */
+static int __init init_zero_pfn(void)
+{
+	zero_pfn = page_to_pfn(ZERO_PAGE(0));
+	return 0;
+}
+core_initcall(init_zero_pfn);
 
 /*
  * If a p?d_bad entry is found while walking page tables, report
@@ -443,6 +456,20 @@ static inline int is_cow_mapping(unsigned int flags)
 	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
 
+#ifndef is_zero_pfn
+static inline int is_zero_pfn(unsigned long pfn)
+{
+	return pfn == zero_pfn;
+}
+#endif
+
+#ifndef my_zero_pfn
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+	return zero_pfn;
+}
+#endif
+
 /*
  * vm_normal_page -- This function gets the "struct page" associated with a pte.
  *
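
These fallbacks only apply when the architecture has not defined the macros itself; an arch with several zero pages (kept apart for cache colouring) can override them. Purely as an illustration, assuming a hypothetical architecture with eight physically contiguous coloured zero pages, its <asm/pgtable.h> might carry something like the following (names and the colour count are assumptions, not part of this patch):

/* Hypothetical arch override; zero_page_base_pfn and the eight colours
 * are assumptions for illustration only. */
extern unsigned long zero_page_base_pfn;

static inline int is_zero_pfn(unsigned long pfn)
{
	return (pfn - zero_page_base_pfn) < 8;	/* one page per cache colour */
}
#define is_zero_pfn is_zero_pfn

#define my_zero_pfn(addr) \
	(zero_page_base_pfn + (((addr) >> PAGE_SHIFT) & 7))

Defining the macro name suppresses the #ifndef fallbacks above, which is the mechanism this hunk sets up.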
@@ -498,7 +525,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 	if (HAVE_PTE_SPECIAL) {
 		if (likely(!pte_special(pte)))
 			goto check_pfn;
-		if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+			return NULL;
+		if (!is_zero_pfn(pfn))
 			print_bad_pte(vma, addr, pte, NULL);
 		return NULL;
 	}
@@ -520,6 +549,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		}
 	}
 
+	if (is_zero_pfn(pfn))
+		return NULL;
 check_pfn:
 	if (unlikely(pfn > highest_memmap_pfn)) {
 		print_bad_pte(vma, addr, pte, NULL);
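
With the zero page reinstated, vm_normal_page() deliberately returns NULL for it, so existing page-table walkers skip rmap and reference-count work on it just as they already do for PFN mappings. A minimal caller sketch (illustrative, not taken from this patch):

/* Sketch of a page-table walker using vm_normal_page(); the zero page and
 * other special pfns come back as NULL and are simply left alone. */
static void example_walk_pte(struct vm_area_struct *vma,
			     unsigned long addr, pte_t pte)
{
	struct page *page;

	if (!pte_present(pte))
		return;
	page = vm_normal_page(vma, addr, pte);
	if (!page)
		return;		/* zero page, raw PFN map, etc: no struct page work */
	/* ... safe to touch page->mapping, mapcount, LRU here ... */
}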
@@ -597,8 +628,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	page = vm_normal_page(vma, addr, pte);
 	if (page) {
 		get_page(page);
-		page_dup_rmap(page, vma, addr);
-		rss[!!PageAnon(page)]++;
+		page_dup_rmap(page);
+		rss[PageAnon(page)]++;
 	}
 
 out_set_pte:
@@ -1143,9 +1174,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
+
 	page = vm_normal_page(vma, address, pte);
-	if (unlikely(!page))
-		goto bad_page;
+	if (unlikely(!page)) {
+		if ((flags & FOLL_DUMP) ||
+		    !is_zero_pfn(pte_pfn(pte)))
+			goto bad_page;
+		page = pte_page(pte);
+	}
 
 	if (flags & FOLL_GET)
 		get_page(page);
@@ -1173,65 +1209,46 @@ no_page:
 	pte_unmap_unlock(ptep, ptl);
 	if (!pte_none(pte))
 		return page;
-	/* Fall through to ZERO_PAGE handling */
+
 no_page_table:
 	/*
 	 * When core dumping an enormous anonymous area that nobody
-	 * has touched so far, we don't want to allocate page tables.
+	 * has touched so far, we don't want to allocate unnecessary pages or
+	 * page tables. Return error instead of NULL to skip handle_mm_fault,
+	 * then get_dump_page() will return NULL to leave a hole in the dump.
+	 * But we can only make this optimization where a hole would surely
+	 * be zero-filled if handle_mm_fault() actually did handle it.
 	 */
-	if (flags & FOLL_ANON) {
-		page = ZERO_PAGE(0);
-		if (flags & FOLL_GET)
-			get_page(page);
-		BUG_ON(flags & FOLL_WRITE);
-	}
+	if ((flags & FOLL_DUMP) &&
+	    (!vma->vm_ops || !vma->vm_ops->fault))
+		return ERR_PTR(-EFAULT);
 	return page;
 }
 
-/* Can we do the FOLL_ANON optimization? */
-static inline int use_zero_page(struct vm_area_struct *vma)
-{
-	/*
-	 * We don't want to optimize FOLL_ANON for make_pages_present()
-	 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
-	 * we want to get the page from the page tables to make sure
-	 * that we serialize and update with any other user of that
-	 * mapping.
-	 */
-	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
-		return 0;
-	/*
-	 * And if we have a fault routine, it's not an anonymous region.
-	 */
-	return !vma->vm_ops || !vma->vm_ops->fault;
-}
-
-
-
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int nr_pages, int flags,
+		     unsigned long start, int nr_pages, unsigned int gup_flags,
 		     struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int vm_flags = 0;
-	int write = !!(flags & GUP_FLAGS_WRITE);
-	int force = !!(flags & GUP_FLAGS_FORCE);
-	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
-	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
+	unsigned long vm_flags;
 
 	if (nr_pages <= 0)
 		return 0;
+
+	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+
 	/*
 	 * Require read or write permissions.
-	 * If 'force' is set, we only require the "MAY" flags.
+	 * If FOLL_FORCE is set, we only require the "MAY" flags.
 	 */
-	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	vm_flags  = (gup_flags & FOLL_WRITE) ?
+			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= (gup_flags & FOLL_FORCE) ?
+			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 	i = 0;
 
 	do {
 		struct vm_area_struct *vma;
-		unsigned int foll_flags;
 
 		vma = find_extend_vma(mm, start);
 		if (!vma && in_gate_area(tsk, start)) {
@@ -1243,7 +1260,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			pte_t *pte;
 
 			/* user gate pages are read-only */
-			if (!ignore && write)
+			if (gup_flags & FOLL_WRITE)
 				return i ? : -EFAULT;
 			if (pg > TASK_SIZE)
 				pgd = pgd_offset_k(pg);
@@ -1277,38 +1294,26 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 		if (!vma ||
 		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
-		    (!ignore && !(vm_flags & vma->vm_flags)))
+		    !(vm_flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
 		if (is_vm_hugetlb_page(vma)) {
 			i = follow_hugetlb_page(mm, vma, pages, vmas,
-					&start, &nr_pages, i, write);
+					&start, &nr_pages, i, gup_flags);
 			continue;
 		}
 
-		foll_flags = FOLL_TOUCH;
-		if (pages)
-			foll_flags |= FOLL_GET;
-		if (!write && use_zero_page(vma))
-			foll_flags |= FOLL_ANON;
-
 		do {
 			struct page *page;
+			unsigned int foll_flags = gup_flags;
 
 			/*
 			 * If we have a pending SIGKILL, don't keep faulting
-			 * pages and potentially allocating memory, unless
-			 * current is handling munlock--e.g., on exit. In
-			 * that case, we are not allocating memory. Rather,
-			 * we're only unlocking already resident/mapped pages.
+			 * pages and potentially allocating memory.
 			 */
-			if (unlikely(!ignore_sigkill &&
-					fatal_signal_pending(current)))
+			if (unlikely(fatal_signal_pending(current)))
 				return i ? i : -ERESTARTSYS;
 
-			if (write)
-				foll_flags |= FOLL_WRITE;
-
 			cond_resched();
 			while (!(page = follow_page(vma, start, foll_flags))) {
 				int ret;
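
With this hunk the GUP_FLAGS_* namespace is gone: in-kernel callers hand FOLL_* bits straight to __get_user_pages(), and the VM_BUG_ON added above insists that a non-NULL pages array is paired with FOLL_GET. An illustrative before/after for such a caller (the wrapper name and context are assumptions, not taken from this patch):

/* Illustration only: how a direct __get_user_pages() caller changes shape. */
static int example_pin_one_page(struct task_struct *tsk, struct mm_struct *mm,
				unsigned long addr, struct page **pagep)
{
	/* old interface (removed by this patch):
	 *   __get_user_pages(tsk, mm, addr, 1,
	 *                    GUP_FLAGS_WRITE | GUP_FLAGS_FORCE, pagep, NULL);
	 *
	 * new interface: the caller supplies the FOLL_* bits directly, and a
	 * non-NULL pages array must be paired with FOLL_GET.
	 */
	return __get_user_pages(tsk, mm, addr, 1,
				FOLL_TOUCH | FOLL_GET | FOLL_WRITE | FOLL_FORCE,
				pagep, NULL);
}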
@@ -1419,18 +1424,47 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int nr_pages, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
 {
-	int flags = 0;
+	int flags = FOLL_TOUCH;
 
+	if (pages)
+		flags |= FOLL_GET;
 	if (write)
-		flags |= GUP_FLAGS_WRITE;
+		flags |= FOLL_WRITE;
 	if (force)
-		flags |= GUP_FLAGS_FORCE;
+		flags |= FOLL_FORCE;
 
 	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
 }
-
 EXPORT_SYMBOL(get_user_pages);
 
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by page_cache_release() or put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_sem, but after all other threads have been killed.
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+
+	if (__get_user_pages(current, current->mm, addr, 1,
+			FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+		return NULL;
+	flush_cache_page(vma, addr, page_to_pfn(page));
+	return page;
+}
+#endif /* CONFIG_ELF_CORE */
+
 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
 			spinlock_t **ptl)
 {
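
The ELF core dumper is the intended caller of get_dump_page(): it pins one page at a time and leaves a hole in the corefile whenever NULL comes back. A simplified sketch of that loop (the real one lives in fs/binfmt_elf.c; the output helper names here are assumptions):

/* Sketch of a per-vma dump loop built on get_dump_page(); dump_write() and
 * dump_seek() stand in for the core-dump output helpers. */
static int example_dump_range(struct file *file, unsigned long start,
			      unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		struct page *page = get_dump_page(addr);
		int stop;

		if (page) {
			void *kaddr = kmap(page);
			stop = !dump_write(file, kaddr, PAGE_SIZE);
			kunmap(page);
			page_cache_release(page);
		} else {
			/* zero page, pte_none, or failure: leave a hole */
			stop = !dump_seek(file, PAGE_SIZE);
		}
		if (stop)
			return 0;
	}
	return 1;
}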
@@ -1608,7 +1642,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 	 * If we don't have pte special, then we have to use the pfn_valid()
 	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
 	 * refcount the page if pfn_valid is true (hence insert_page rather
-	 * than insert_pfn).
+	 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
+	 * without pte special, it would there be refcounted as a normal page.
 	 */
 	if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
 		struct page *page;
@@ -1974,7 +2009,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Take out anonymous pages first, anonymous shared vmas are
 	 * not dirty accountable.
 	 */
-	if (PageAnon(old_page)) {
+	if (PageAnon(old_page) && !PageKsm(old_page)) {
 		if (!trylock_page(old_page)) {
 			page_cache_get(old_page);
 			pte_unmap_unlock(page_table, ptl);
@@ -2075,10 +2110,19 @@ gotten:
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	VM_BUG_ON(old_page == ZERO_PAGE(0));
-	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-	if (!new_page)
-		goto oom;
+
+	if (is_zero_pfn(pte_pfn(orig_pte))) {
+		new_page = alloc_zeroed_user_highpage_movable(vma, address);
+		if (!new_page)
+			goto oom;
+	} else {
+		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+		if (!new_page)
+			goto oom;
+		cow_user_page(new_page, old_page, address, vma);
+	}
+	__SetPageUptodate(new_page);
+
 	/*
 	 * Don't let another task, with possibly unlocked vma,
 	 * keep the mlocked page.
@@ -2088,8 +2132,6 @@ gotten:
 		clear_page_mlock(old_page);
 		unlock_page(old_page);
 	}
-	cow_user_page(new_page, old_page, address, vma);
-	__SetPageUptodate(new_page);
 
 	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
@@ -2115,9 +2157,14 @@ gotten:
 		 * seen in the presence of one thread doing SMC and another
 		 * thread doing COW.
 		 */
-		ptep_clear_flush_notify(vma, address, page_table);
+		ptep_clear_flush(vma, address, page_table);
 		page_add_new_anon_rmap(new_page, vma, address);
-		set_pte_at(mm, address, page_table, entry);
+		/*
+		 * We call the notify macro here because, when using secondary
+		 * mmu page tables (such as kvm shadow page tables), we want the
+		 * new page to be mapped directly into the secondary page table.
+		 */
+		set_pte_at_notify(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		if (old_page) {
 			/*
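
For context, set_pte_at_notify() pairs the PTE update with the mmu_notifier change_pte callback, so a secondary MMU (such as KVM's shadow or EPT tables) can be repointed at the new page rather than merely invalidated and refaulted. A hypothetical notifier user, sketched here only to show the shape of the callback and not part of this patch:

#include <linux/mmu_notifier.h>

/* Hypothetical secondary-MMU driver: update its own translation on COW. */
static void example_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
			       unsigned long address, pte_t pte)
{
	/* repoint the secondary mapping of 'address' at pte_pfn(pte) */
}

static const struct mmu_notifier_ops example_mmu_notifier_ops = {
	.change_pte	= example_change_pte,
	/* .invalidate_page, .invalidate_range_start/end, ... as needed */
};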
@@ -2625,6 +2672,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;
 
+	if (!(flags & FAULT_FLAG_WRITE)) {
+		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
+						vma->vm_page_prot));
+		ptl = pte_lockptr(mm, pmd);
+		spin_lock(ptl);
+		if (!pte_none(*page_table))
+			goto unlock;
+		goto setpte;
+	}
+
 	/* Allocate our own private page. */
 	pte_unmap(page_table);
 
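
The new read-fault path means untouched anonymous memory costs nothing until it is written. A small userspace demonstration of the behaviour this hunk restores (illustrative only):

/* Userspace demonstration: read faults on untouched anonymous memory map the
 * shared zero page, so RSS only grows on the first write. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB anonymous mapping */
	volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	long sum = 0;
	size_t i;

	if (p == MAP_FAILED)
		return 1;
	for (i = 0; i < len; i += 4096)	/* reads: zero page, RSS stays small */
		sum += p[i];
	memset((void *)p, 1, len);	/* writes: COW real pages, RSS grows */
	printf("sum=%ld; compare VmRSS in /proc/self/status before/after\n", sum);
	return 0;
}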
@@ -2639,13 +2696,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto oom_free_page;
 
 	entry = mk_pte(page, vma->vm_page_prot);
-	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	if (vma->vm_flags & VM_WRITE)
+		entry = pte_mkwrite(pte_mkdirty(entry));
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!pte_none(*page_table))
 		goto release;
+
 	inc_mm_counter(mm, anon_rss);
 	page_add_new_anon_rmap(page, vma, address);
+setpte:
 	set_pte_at(mm, address, page_table, entry);
 
 	/* No need to invalidate - it was non-present before */