Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	311
1 file changed, 169 insertions, 142 deletions
diff --git a/mm/memory.c b/mm/memory.c
index aede2ce3aba4..6ab19dd4a199 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -45,6 +45,7 @@
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/delayacct.h>
@@ -56,6 +57,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include <asm/io.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
@@ -106,6 +108,18 @@ static int __init disable_randmaps(char *s)
 }
 __setup("norandmaps", disable_randmaps);
 
+unsigned long zero_pfn __read_mostly;
+unsigned long highest_memmap_pfn __read_mostly;
+
+/*
+ * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
+ */
+static int __init init_zero_pfn(void)
+{
+	zero_pfn = page_to_pfn(ZERO_PAGE(0));
+	return 0;
+}
+core_initcall(init_zero_pfn);
 
 /*
  * If a p?d_bad entry is found while walking page tables, report
@@ -283,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		unsigned long addr = vma->vm_start;
 
 		/*
-		 * Hide vma from rmap and vmtruncate before freeing pgtables
+		 * Hide vma from rmap and truncate_pagecache before freeing
+		 * pgtables
 		 */
 		anon_vma_unlink(vma);
 		unlink_file_vma(vma);
@@ -442,6 +457,20 @@ static inline int is_cow_mapping(unsigned int flags)
 	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
 
+#ifndef is_zero_pfn
+static inline int is_zero_pfn(unsigned long pfn)
+{
+	return pfn == zero_pfn;
+}
+#endif
+
+#ifndef my_zero_pfn
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+	return zero_pfn;
+}
+#endif
+
 /*
  * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
@@ -497,7 +526,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 	if (HAVE_PTE_SPECIAL) {
 		if (likely(!pte_special(pte)))
 			goto check_pfn;
-		if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+			return NULL;
+		if (!is_zero_pfn(pfn))
 			print_bad_pte(vma, addr, pte, NULL);
 		return NULL;
 	}
@@ -519,6 +550,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		}
 	}
 
+	if (is_zero_pfn(pfn))
+		return NULL;
 check_pfn:
 	if (unlikely(pfn > highest_memmap_pfn)) {
 		print_bad_pte(vma, addr, pte, NULL);
@@ -596,8 +629,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	page = vm_normal_page(vma, addr, pte);
 	if (page) {
 		get_page(page);
-		page_dup_rmap(page, vma, addr);
-		rss[!!PageAnon(page)]++;
+		page_dup_rmap(page);
+		rss[PageAnon(page)]++;
 	}
 
 out_set_pte:
@@ -608,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
 		   unsigned long addr, unsigned long end)
 {
+	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
@@ -621,6 +655,8 @@ again:
 	src_pte = pte_offset_map_nested(src_pmd, addr);
 	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+	orig_src_pte = src_pte;
+	orig_dst_pte = dst_pte;
 	arch_enter_lazy_mmu_mode();
 
 	do {
@@ -644,9 +680,9 @@ again:
 
 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
-	pte_unmap_nested(src_pte - 1);
+	pte_unmap_nested(orig_src_pte);
 	add_mm_rss(dst_mm, rss[0], rss[1]);
-	pte_unmap_unlock(dst_pte - 1, dst_ptl);
+	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
 	if (addr != end)
 		goto again;
@@ -1142,9 +1178,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
+
 	page = vm_normal_page(vma, address, pte);
-	if (unlikely(!page))
-		goto bad_page;
+	if (unlikely(!page)) {
+		if ((flags & FOLL_DUMP) ||
+		    !is_zero_pfn(pte_pfn(pte)))
+			goto bad_page;
+		page = pte_page(pte);
+	}
 
 	if (flags & FOLL_GET)
 		get_page(page);
@@ -1172,65 +1213,46 @@ no_page:
 	pte_unmap_unlock(ptep, ptl);
 	if (!pte_none(pte))
 		return page;
-	/* Fall through to ZERO_PAGE handling */
+
 no_page_table:
 	/*
 	 * When core dumping an enormous anonymous area that nobody
-	 * has touched so far, we don't want to allocate page tables.
+	 * has touched so far, we don't want to allocate unnecessary pages or
+	 * page tables. Return error instead of NULL to skip handle_mm_fault,
+	 * then get_dump_page() will return NULL to leave a hole in the dump.
+	 * But we can only make this optimization where a hole would surely
+	 * be zero-filled if handle_mm_fault() actually did handle it.
 	 */
-	if (flags & FOLL_ANON) {
-		page = ZERO_PAGE(0);
-		if (flags & FOLL_GET)
-			get_page(page);
-		BUG_ON(flags & FOLL_WRITE);
-	}
+	if ((flags & FOLL_DUMP) &&
+	    (!vma->vm_ops || !vma->vm_ops->fault))
+		return ERR_PTR(-EFAULT);
 	return page;
 }
 
-/* Can we do the FOLL_ANON optimization? */
-static inline int use_zero_page(struct vm_area_struct *vma)
-{
-	/*
-	 * We don't want to optimize FOLL_ANON for make_pages_present()
-	 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
-	 * we want to get the page from the page tables to make sure
-	 * that we serialize and update with any other user of that
-	 * mapping.
-	 */
-	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
-		return 0;
-	/*
-	 * And if we have a fault routine, it's not an anonymous region.
-	 */
-	return !vma->vm_ops || !vma->vm_ops->fault;
-}
-
-
-
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int nr_pages, int flags,
+		     unsigned long start, int nr_pages, unsigned int gup_flags,
 		     struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int vm_flags = 0;
-	int write = !!(flags & GUP_FLAGS_WRITE);
-	int force = !!(flags & GUP_FLAGS_FORCE);
-	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
-	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
+	unsigned long vm_flags;
 
 	if (nr_pages <= 0)
 		return 0;
+
+	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+
 	/*
 	 * Require read or write permissions.
-	 * If 'force' is set, we only require the "MAY" flags.
+	 * If FOLL_FORCE is set, we only require the "MAY" flags.
 	 */
-	vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	vm_flags = (gup_flags & FOLL_WRITE) ?
+			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= (gup_flags & FOLL_FORCE) ?
+			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 	i = 0;
 
 	do {
 		struct vm_area_struct *vma;
-		unsigned int foll_flags;
 
 		vma = find_extend_vma(mm, start);
 		if (!vma && in_gate_area(tsk, start)) {
@@ -1242,7 +1264,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			pte_t *pte;
 
 			/* user gate pages are read-only */
-			if (!ignore && write)
+			if (gup_flags & FOLL_WRITE)
 				return i ? : -EFAULT;
 			if (pg > TASK_SIZE)
 				pgd = pgd_offset_k(pg);
@@ -1276,38 +1298,26 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 		if (!vma ||
 		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
-		    (!ignore && !(vm_flags & vma->vm_flags)))
+		    !(vm_flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
 		if (is_vm_hugetlb_page(vma)) {
 			i = follow_hugetlb_page(mm, vma, pages, vmas,
-					&start, &nr_pages, i, write);
+					&start, &nr_pages, i, gup_flags);
 			continue;
 		}
 
-		foll_flags = FOLL_TOUCH;
-		if (pages)
-			foll_flags |= FOLL_GET;
-		if (!write && use_zero_page(vma))
-			foll_flags |= FOLL_ANON;
-
 		do {
 			struct page *page;
+			unsigned int foll_flags = gup_flags;
 
 			/*
 			 * If we have a pending SIGKILL, don't keep faulting
-			 * pages and potentially allocating memory, unless
-			 * current is handling munlock--e.g., on exit. In
-			 * that case, we are not allocating memory. Rather,
-			 * we're only unlocking already resident/mapped pages.
+			 * pages and potentially allocating memory.
 			 */
-			if (unlikely(!ignore_sigkill &&
-					fatal_signal_pending(current)))
+			if (unlikely(fatal_signal_pending(current)))
 				return i ? i : -ERESTARTSYS;
 
-			if (write)
-				foll_flags |= FOLL_WRITE;
-
 			cond_resched();
 			while (!(page = follow_page(vma, start, foll_flags))) {
 				int ret;
@@ -1319,7 +1329,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				if (ret & VM_FAULT_ERROR) {
 					if (ret & VM_FAULT_OOM)
 						return i ? i : -ENOMEM;
-					else if (ret & VM_FAULT_SIGBUS)
+					if (ret &
+					    (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
 						return i ? i : -EFAULT;
 					BUG();
 				}
@@ -1418,18 +1429,47 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int nr_pages, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
 {
-	int flags = 0;
+	int flags = FOLL_TOUCH;
 
+	if (pages)
+		flags |= FOLL_GET;
 	if (write)
-		flags |= GUP_FLAGS_WRITE;
+		flags |= FOLL_WRITE;
 	if (force)
-		flags |= GUP_FLAGS_FORCE;
+		flags |= FOLL_FORCE;
 
 	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
 }
-
 EXPORT_SYMBOL(get_user_pages);
 
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by page_cache_release() or put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_sem, but after all other threads have been killed.
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+
+	if (__get_user_pages(current, current->mm, addr, 1,
+			FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+		return NULL;
+	flush_cache_page(vma, addr, page_to_pfn(page));
+	return page;
+}
+#endif /* CONFIG_ELF_CORE */
+
 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
 			spinlock_t **ptl)
 {
@@ -1607,7 +1647,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 	 * If we don't have pte special, then we have to use the pfn_valid()
 	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
 	 * refcount the page if pfn_valid is true (hence insert_page rather
-	 * than insert_pfn).
+	 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
+	 * without pte special, it would there be refcounted as a normal page.
 	 */
 	if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
 		struct page *page;
@@ -1782,10 +1823,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	token = pmd_pgtable(*pmd);
 
 	do {
-		err = fn(pte, token, addr, data);
+		err = fn(pte++, token, addr, data);
 		if (err)
 			break;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+	} while (addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
 
@@ -1973,7 +2014,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Take out anonymous pages first, anonymous shared vmas are
 	 * not dirty accountable.
 	 */
-	if (PageAnon(old_page)) {
+	if (PageAnon(old_page) && !PageKsm(old_page)) {
 		if (!trylock_page(old_page)) {
 			page_cache_get(old_page);
 			pte_unmap_unlock(page_table, ptl);
@@ -2074,10 +2115,19 @@ gotten:
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	VM_BUG_ON(old_page == ZERO_PAGE(0));
-	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-	if (!new_page)
-		goto oom;
+
+	if (is_zero_pfn(pte_pfn(orig_pte))) {
+		new_page = alloc_zeroed_user_highpage_movable(vma, address);
+		if (!new_page)
+			goto oom;
+	} else {
+		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+		if (!new_page)
+			goto oom;
+		cow_user_page(new_page, old_page, address, vma);
+	}
+	__SetPageUptodate(new_page);
+
 	/*
 	 * Don't let another task, with possibly unlocked vma,
 	 * keep the mlocked page.
@@ -2087,8 +2137,6 @@ gotten:
 		clear_page_mlock(old_page);
 		unlock_page(old_page);
 	}
-	cow_user_page(new_page, old_page, address, vma);
-	__SetPageUptodate(new_page);
 
 	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
@@ -2114,9 +2162,14 @@ gotten:
 	 * seen in the presence of one thread doing SMC and another
 	 * thread doing COW.
 	 */
-	ptep_clear_flush_notify(vma, address, page_table);
+	ptep_clear_flush(vma, address, page_table);
 	page_add_new_anon_rmap(new_page, vma, address);
-	set_pte_at(mm, address, page_table, entry);
+	/*
+	 * We call the notify macro here because, when using secondary
+	 * mmu page tables (such as kvm shadow page tables), we want the
+	 * new page to be mapped directly into the secondary page table.
+	 */
+	set_pte_at_notify(mm, address, page_table, entry);
 	update_mmu_cache(vma, address, entry);
 	if (old_page) {
 		/*
@@ -2359,7 +2412,7 @@ restart:
  * @mapping: the address space containing mmaps to be unmapped.
  * @holebegin: byte in first page to unmap, relative to the start of
  * the underlying file. This will be rounded down to a PAGE_SIZE
- * boundary. Note that this is different from vmtruncate(), which
+ * boundary. Note that this is different from truncate_pagecache(), which
 * must keep the partial page. In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes. This will be rounded
@@ -2410,63 +2463,6 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-/**
- * vmtruncate - unmap mappings "freed" by truncate() syscall
- * @inode: inode of the file used
- * @offset: file offset to start truncating
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page. Ugly, but necessary.
- */
-int vmtruncate(struct inode * inode, loff_t offset)
-{
-	if (inode->i_size < offset) {
-		unsigned long limit;
-
-		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-		if (limit != RLIM_INFINITY && offset > limit)
-			goto out_sig;
-		if (offset > inode->i_sb->s_maxbytes)
-			goto out_big;
-		i_size_write(inode, offset);
-	} else {
-		struct address_space *mapping = inode->i_mapping;
-
-		/*
-		 * truncation of in-use swapfiles is disallowed - it would
-		 * cause subsequent swapout to scribble on the now-freed
-		 * blocks.
-		 */
-		if (IS_SWAPFILE(inode))
-			return -ETXTBSY;
-		i_size_write(inode, offset);
-
-		/*
-		 * unmap_mapping_range is called twice, first simply for
-		 * efficiency so that truncate_inode_pages does fewer
-		 * single-page unmaps. However after this first call, and
-		 * before truncate_inode_pages finishes, it is possible for
-		 * private pages to be COWed, which remain after
-		 * truncate_inode_pages finishes, hence the second
-		 * unmap_mapping_range call must be made for correctness.
-		 */
-		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-		truncate_inode_pages(mapping, offset);
-		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-	}
-
-	if (inode->i_op->truncate)
-		inode->i_op->truncate(inode);
-	return 0;
-
-out_sig:
-	send_sig(SIGXFSZ, current, 0);
-out_big:
-	return -EFBIG;
-}
-EXPORT_SYMBOL(vmtruncate);
-
 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 {
 	struct address_space *mapping = inode->i_mapping;
@@ -2511,8 +2507,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
-	if (is_migration_entry(entry)) {
-		migration_entry_wait(mm, pmd, address);
+	if (unlikely(non_swap_entry(entry))) {
+		if (is_migration_entry(entry)) {
+			migration_entry_wait(mm, pmd, address);
+		} else if (is_hwpoison_entry(entry)) {
+			ret = VM_FAULT_HWPOISON;
+		} else {
+			print_bad_pte(vma, address, orig_pte, NULL);
+			ret = VM_FAULT_OOM;
+		}
 		goto out;
 	}
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
@@ -2536,6 +2539,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
+	} else if (PageHWPoison(page)) {
+		ret = VM_FAULT_HWPOISON;
+		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+		goto out_release;
 	}
 
 	lock_page(page);
@@ -2607,6 +2614,7 @@ out_nomap:
 	pte_unmap_unlock(page_table, ptl);
 out_page:
 	unlock_page(page);
+out_release:
 	page_cache_release(page);
 	return ret;
 }
@@ -2624,6 +2632,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;
 
+	if (!(flags & FAULT_FLAG_WRITE)) {
+		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
+						vma->vm_page_prot));
+		ptl = pte_lockptr(mm, pmd);
+		spin_lock(ptl);
+		if (!pte_none(*page_table))
+			goto unlock;
+		goto setpte;
+	}
+
 	/* Allocate our own private page. */
 	pte_unmap(page_table);
 
@@ -2638,13 +2656,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto oom_free_page;
 
 	entry = mk_pte(page, vma->vm_page_prot);
-	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	if (vma->vm_flags & VM_WRITE)
+		entry = pte_mkwrite(pte_mkdirty(entry));
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!pte_none(*page_table))
 		goto release;
+
 	inc_mm_counter(mm, anon_rss);
 	page_add_new_anon_rmap(page, vma, address);
+setpte:
 	set_pte_at(mm, address, page_table, entry);
 
 	/* No need to invalidate - it was non-present before */
@@ -2699,6 +2720,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
 		return ret;
 
+	if (unlikely(PageHWPoison(vmf.page))) {
+		if (ret & VM_FAULT_LOCKED)
+			unlock_page(vmf.page);
+		return VM_FAULT_HWPOISON;
+	}
+
 	/*
 	 * For consistency in subsequent calls, make the faulted page always
 	 * locked.