Diffstat (limited to 'mm/memory.c')
 -rw-r--r--   mm/memory.c | 360
 1 file changed, 269 insertions, 91 deletions

diff --git a/mm/memory.c b/mm/memory.c
index 02e48aa0ed1..8e8c1832486 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	}
 }
 
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+		pmd_t *pmd, unsigned long address)
 {
 	pgtable_t new = pte_alloc_one(mm, address);
+	int wait_split_huge_page;
 	if (!new)
 		return -ENOMEM;
 
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 
 	spin_lock(&mm->page_table_lock);
-	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
+	wait_split_huge_page = 0;
+	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
 		mm->nr_ptes++;
 		pmd_populate(mm, pmd, new);
 		new = NULL;
-	}
+	} else if (unlikely(pmd_trans_splitting(*pmd)))
+		wait_split_huge_page = 1;
 	spin_unlock(&mm->page_table_lock);
 	if (new)
 		pte_free(mm, new);
+	if (wait_split_huge_page)
+		wait_split_huge_page(vma->anon_vma, pmd);
 	return 0;
 }
 
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
 	smp_wmb(); /* See comment in __pte_alloc */
 
 	spin_lock(&init_mm.page_table_lock);
-	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
+	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
 		pmd_populate_kernel(&init_mm, pmd, new);
 		new = NULL;
-	}
+	} else
+		VM_BUG_ON(pmd_trans_splitting(*pmd));
 	spin_unlock(&init_mm.page_table_lock);
 	if (new)
 		pte_free_kernel(&init_mm, new);
@@ -719,9 +726,9 @@ out_set_pte:
 	return 0;
 }
 
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
 {
 	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
 	src_pmd = pmd_offset(src_pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
+		if (pmd_trans_huge(*src_pmd)) {
+			int err;
+			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+			err = copy_huge_pmd(dst_mm, src_mm,
+					    dst_pmd, src_pmd, addr, vma);
+			if (err == -ENOMEM)
+				return -ENOMEM;
+			if (!err)
+				continue;
+			/* fall through */
+		}
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
 		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
+		if (pmd_trans_huge(*pmd)) {
+			if (next-addr != HPAGE_PMD_SIZE) {
+				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+				split_huge_page_pmd(vma->vm_mm, pmd);
+			} else if (zap_huge_pmd(tlb, vma, pmd)) {
+				(*zap_work)--;
+				continue;
+			}
+			/* fall through */
+		}
 		if (pmd_none_or_clear_bad(pmd)) {
 			(*zap_work)--;
 			continue;
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud))
 		goto no_page_table;
-	if (pud_huge(*pud)) {
+	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
 		BUG_ON(flags & FOLL_GET);
 		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
 		goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		goto no_page_table;
-	if (pmd_huge(*pmd)) {
+	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
 		BUG_ON(flags & FOLL_GET);
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
+	if (pmd_trans_huge(*pmd)) {
+		if (flags & FOLL_SPLIT) {
+			split_huge_page_pmd(mm, pmd);
+			goto split_fallthrough;
+		}
+		spin_lock(&mm->page_table_lock);
+		if (likely(pmd_trans_huge(*pmd))) {
+			if (unlikely(pmd_trans_splitting(*pmd))) {
+				spin_unlock(&mm->page_table_lock);
+				wait_split_huge_page(vma->anon_vma, pmd);
+			} else {
+				page = follow_trans_huge_pmd(mm, address,
+							     pmd, flags);
+				spin_unlock(&mm->page_table_lock);
+				goto out;
+			}
+		} else
+			spin_unlock(&mm->page_table_lock);
+		/* fall through */
+	}
+split_fallthrough:
 	if (unlikely(pmd_bad(*pmd)))
 		goto no_page_table;
 
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		 */
 		mark_page_accessed(page);
 	}
+	if (flags & FOLL_MLOCK) {
+		/*
+		 * The preliminary mapping check is mainly to avoid the
+		 * pointless overhead of lock_page on the ZERO_PAGE
+		 * which might bounce very badly if there is contention.
+		 *
+		 * If the page is already locked, we don't need to
+		 * handle it now - vmscan will handle it later if and
+		 * when it attempts to reclaim the page.
+		 */
+		if (page->mapping && trylock_page(page)) {
+			lru_add_drain();	/* push cached pages to LRU */
+			/*
+			 * Because we lock page here and migration is
+			 * blocked by the pte's page reference, we need
+			 * only check for file-cache page truncation.
+			 */
+			if (page->mapping)
+				mlock_vma_page(page);
+			unlock_page(page);
+		}
+	}
 unlock:
 	pte_unmap_unlock(ptep, ptl);
 out:
@@ -1341,7 +1412,8 @@ no_page_table:
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long start, int nr_pages, unsigned int gup_flags,
-		     struct page **pages, struct vm_area_struct **vmas)
+		     struct page **pages, struct vm_area_struct **vmas,
+		     int *nonblocking)
 {
 	int i;
 	unsigned long vm_flags;
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			pmd = pmd_offset(pud, pg);
 			if (pmd_none(*pmd))
 				return i ? : -EFAULT;
+			VM_BUG_ON(pmd_trans_huge(*pmd));
 			pte = pte_offset_map(pmd, pg);
 			if (pte_none(*pte)) {
 				pte_unmap(pte);
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			cond_resched();
 			while (!(page = follow_page(vma, start, foll_flags))) {
 				int ret;
+				unsigned int fault_flags = 0;
+
+				if (foll_flags & FOLL_WRITE)
+					fault_flags |= FAULT_FLAG_WRITE;
+				if (nonblocking)
+					fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 
 				ret = handle_mm_fault(mm, vma, start,
-					(foll_flags & FOLL_WRITE) ?
-					FAULT_FLAG_WRITE : 0);
+							fault_flags);
 
 				if (ret & VM_FAULT_ERROR) {
 					if (ret & VM_FAULT_OOM)
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				else
 					tsk->min_flt++;
 
+				if (ret & VM_FAULT_RETRY) {
+					*nonblocking = 0;
+					return i;
+				}
+
 				/*
 				 * The VM_FAULT_WRITE bit tells us that
 				 * do_wp_page has broken COW when necessary,
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	if (force)
 		flags |= FOLL_FORCE;
 
-	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+				NULL);
 }
 EXPORT_SYMBOL(get_user_pages);
 
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr)
 	struct page *page;
 
 	if (__get_user_pages(current, current->mm, addr, 1,
-			FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+			     FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+			     NULL) < 1)
 		return NULL;
 	flush_cache_page(vma, addr, page_to_pfn(page));
 	return page;
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
 	pud_t * pud = pud_alloc(mm, pgd, addr);
 	if (pud) {
 		pmd_t * pmd = pmd_alloc(mm, pud, addr);
-		if (pmd)
+		if (pmd) {
+			VM_BUG_ON(pmd_trans_huge(*pmd));
 			return pte_alloc_map_lock(mm, pmd, addr, ptl);
+		}
 	}
 	return NULL;
 }
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
+	VM_BUG_ON(pmd_trans_huge(*pmd));
 	do {
 		next = pmd_addr_end(addr, end);
 		if (remap_pte_range(mm, pmd, addr, next,
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
 	return same;
 }
 
-/*
- * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
- * servicing faults for write access.  In the normal case, do always want
- * pte_mkwrite.  But get_user_pages can cause write faults for mappings
- * that do not have writing enabled, when used by access_process_vm.
- */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-{
-	if (likely(vma->vm_flags & VM_WRITE))
-		pte = pte_mkwrite(pte);
-	return pte;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
 	/*
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int reuse = 0, ret = 0;
+	int ret = 0;
 	int page_mkwrite = 0;
 	struct page *dirty_page = NULL;
 
@@ -2144,19 +2219,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
							 &ptl);
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
-				page_cache_release(old_page);
 				goto unlock;
 			}
 			page_cache_release(old_page);
 		}
-		reuse = reuse_swap_page(old_page);
-		if (reuse)
+		if (reuse_swap_page(old_page)) {
 			/*
 			 * The page is all ours.  Move it to our anon_vma so
 			 * the rmap code will not search our parent or siblings.
 			 * Protected against the rmap code by the page lock.
 			 */
 			page_move_anon_rmap(old_page, vma, address);
+			unlock_page(old_page);
+			goto reuse;
+		}
 		unlock_page(old_page);
 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
@@ -2212,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
						 &ptl);
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
-				page_cache_release(old_page);
 				goto unlock;
 			}
 
@@ -2220,18 +2295,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 		dirty_page = old_page;
 		get_page(dirty_page);
-		reuse = 1;
-	}
 
-	if (reuse) {
 reuse:
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = pte_mkyoung(orig_pte);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (ptep_set_access_flags(vma, address, page_table, entry,1))
 			update_mmu_cache(vma, address, page_table);
+		pte_unmap_unlock(page_table, ptl);
 		ret |= VM_FAULT_WRITE;
-		goto unlock;
+
+		if (!dirty_page)
+			return ret;
+
+		/*
+		 * Yes, Virginia, this is actually required to prevent a race
+		 * with clear_page_dirty_for_io() from clearing the page dirty
+		 * bit after it clear all dirty ptes, but before a racing
+		 * do_wp_page installs a dirty pte.
+		 *
+		 * do_no_page is protected similarly.
+		 */
+		if (!page_mkwrite) {
+			wait_on_page_locked(dirty_page);
+			set_page_dirty_balance(dirty_page, page_mkwrite);
+		}
+		put_page(dirty_page);
+		if (page_mkwrite) {
+			struct address_space *mapping = dirty_page->mapping;
+
+			set_page_dirty(dirty_page);
+			unlock_page(dirty_page);
+			page_cache_release(dirty_page);
+			if (mapping) {
+				/*
+				 * Some device drivers do not set page.mapping
+				 * but still dirty their pages
+				 */
+				balance_dirty_pages_ratelimited(mapping);
+			}
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file)
+			file_update_time(vma->vm_file);
+
+		return ret;
 	}
 
 	/*
@@ -2256,16 +2365,6 @@ gotten:
 	}
 	__SetPageUptodate(new_page);
 
-	/*
-	 * Don't let another task, with possibly unlocked vma,
-	 * keep the mlocked page.
-	 */
-	if ((vma->vm_flags & VM_LOCKED) && old_page) {
-		lock_page(old_page);	/* for LRU manipulation */
-		clear_page_mlock(old_page);
-		unlock_page(old_page);
-	}
-
 	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
 
@@ -2333,42 +2432,19 @@ gotten:
 
 	if (new_page)
 		page_cache_release(new_page);
-	if (old_page)
-		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
-	if (dirty_page) {
-		/*
-		 * Yes, Virginia, this is actually required to prevent a race
-		 * with clear_page_dirty_for_io() from clearing the page dirty
-		 * bit after it clear all dirty ptes, but before a racing
-		 * do_wp_page installs a dirty pte.
-		 *
-		 * do_no_page is protected similarly.
-		 */
-		if (!page_mkwrite) {
-			wait_on_page_locked(dirty_page);
-			set_page_dirty_balance(dirty_page, page_mkwrite);
-		}
-		put_page(dirty_page);
-		if (page_mkwrite) {
-			struct address_space *mapping = dirty_page->mapping;
-
-			set_page_dirty(dirty_page);
-			unlock_page(dirty_page);
-			page_cache_release(dirty_page);
-			if (mapping) {
-				/*
-				 * Some device drivers do not set page.mapping
-				 * but still dirty their pages
-				 */
-				balance_dirty_pages_ratelimited(mapping);
-			}
-		}
-
-		/* file_update_time outside page_lock */
-		if (vma->vm_file)
-			file_update_time(vma->vm_file);
+	if (old_page) {
+		/*
+		 * Don't let another task, with possibly unlocked vma,
+		 * keep the mlocked page.
+		 */
+		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+			lock_page(old_page);	/* LRU manipulation */
+			munlock_vma_page(old_page);
+			unlock_page(old_page);
+		}
+		page_cache_release(old_page);
 	}
 	return ret;
 oom_free_new:
@@ -2975,12 +3051,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
				goto out;
			}
			charged = 1;
-			/*
-			 * Don't let another task, with possibly unlocked vma,
-			 * keep the mlocked page.
-			 */
-			if (vma->vm_flags & VM_LOCKED)
-				clear_page_mlock(vmf.page);
			copy_user_highpage(page, vmf.page, address, vma);
			__SetPageUptodate(page);
		} else {
@@ -3147,9 +3217,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static inline int handle_pte_fault(struct mm_struct *mm,
+int handle_pte_fault(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, pmd_t *pmd, unsigned int flags)
 {
 	pte_t entry;
 	spinlock_t *ptl;
@@ -3228,9 +3298,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
 		return VM_FAULT_OOM;
-	pte = pte_alloc_map(mm, pmd, address);
-	if (!pte)
+	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+		if (!vma->vm_ops)
+			return do_huge_pmd_anonymous_page(mm, vma, address,
+							  pmd, flags);
+	} else {
+		pmd_t orig_pmd = *pmd;
+		barrier();
+		if (pmd_trans_huge(orig_pmd)) {
+			if (flags & FAULT_FLAG_WRITE &&
+			    !pmd_write(orig_pmd) &&
+			    !pmd_trans_splitting(orig_pmd))
+				return do_huge_pmd_wp_page(mm, vma, address,
+							   pmd, orig_pmd);
+			return 0;
+		}
+	}
+
+	/*
+	 * Use __pte_alloc instead of pte_alloc_map, because we can't
+	 * run pte_offset_map on the pmd, if an huge pmd could
+	 * materialize from under us from a different thread.
+	 */
+	if (unlikely(__pte_alloc(mm, vma, pmd, address)))
 		return VM_FAULT_OOM;
+	/* if an huge pmd materialized from under us just retry later */
+	if (unlikely(pmd_trans_huge(*pmd)))
+		return 0;
+	/*
+	 * A regular pmd is established and it can't morph into a huge pmd
+	 * from under us anymore at this point because we hold the mmap_sem
+	 * read mode and khugepaged takes it in write mode. So now it's
+	 * safe to run pte_offset_map().
+	 */
+	pte = pte_offset_map(pmd, address);
 
 	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }
@@ -3296,7 +3397,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
 	vma = find_vma(current->mm, addr);
 	if (!vma)
 		return -ENOMEM;
-	write = (vma->vm_flags & VM_WRITE) != 0;
+	/*
+	 * We want to touch writable mappings with a write fault in order
+	 * to break COW, except for shared mappings because these don't COW
+	 * and we would not want to dirty them for nothing.
+	 */
+	write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
 	BUG_ON(addr >= end);
 	BUG_ON(end > vma->vm_end);
 	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3368,6 +3474,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
 		goto out;
 
 	pmd = pmd_offset(pud, address);
+	VM_BUG_ON(pmd_trans_huge(*pmd));
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
 		goto out;
 
@@ -3608,3 +3715,74 @@ void might_fault(void)
 }
 EXPORT_SYMBOL(might_fault);
 #endif
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+static void clear_gigantic_page(struct page *page,
+				unsigned long addr,
+				unsigned int pages_per_huge_page)
+{
+	int i;
+	struct page *p = page;
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page;
+	     i++, p = mem_map_next(p, page, i)) {
+		cond_resched();
+		clear_user_highpage(p, addr + i * PAGE_SIZE);
+	}
+}
+void clear_huge_page(struct page *page,
+		     unsigned long addr, unsigned int pages_per_huge_page)
+{
+	int i;
+
+	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+		clear_gigantic_page(page, addr, pages_per_huge_page);
+		return;
+	}
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page; i++) {
+		cond_resched();
+		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+	}
+}
+
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
+				    unsigned long addr,
+				    struct vm_area_struct *vma,
+				    unsigned int pages_per_huge_page)
+{
+	int i;
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+
+	for (i = 0; i < pages_per_huge_page; ) {
+		cond_resched();
+		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
+
+void copy_user_huge_page(struct page *dst, struct page *src,
+			 unsigned long addr, struct vm_area_struct *vma,
+			 unsigned int pages_per_huge_page)
+{
+	int i;
+
+	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+		copy_user_gigantic_page(dst, src, addr, vma,
+					pages_per_huge_page);
+		return;
+	}
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page; i++) {
+		cond_resched();
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+	}
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
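
Illustration only, not part of the patch: a rough caller-side sketch of the new "nonblocking" argument that this diff adds to __get_user_pages(). The helper name pin_range(), its gup flags, and the include of "internal.h" are hypothetical; the sketch also assumes the convention from the wider series (not visible in this file) that a fault returning VM_FAULT_RETRY has already dropped mmap_sem, which is why __get_user_pages() clears *nonblocking and returns early.

/*
 * Hypothetical helper, assumed to live next to mm/memory.c so it can
 * see the mm-internal declaration of __get_user_pages().
 */
#include <linux/mm.h>
#include <linux/sched.h>
#include "internal.h"		/* assumed home of __get_user_pages() */

static long pin_range(struct mm_struct *mm, unsigned long start,
		      int nr_pages, struct page **pages)
{
	long nr_pinned = 0;
	int locked = 1;		/* passed as the new "nonblocking" argument */

	down_read(&mm->mmap_sem);
	while (nr_pages > 0) {
		int ret;

		if (!locked) {
			/*
			 * Assumption from the wider series: the fault path
			 * released mmap_sem before asking for a retry, and
			 * __get_user_pages() reported that by clearing
			 * *nonblocking.  Re-take the lock and try again.
			 */
			locked = 1;
			down_read(&mm->mmap_sem);
		}
		ret = __get_user_pages(current, mm, start, nr_pages,
				       FOLL_TOUCH | FOLL_GET, pages, NULL,
				       &locked);
		if (ret > 0) {
			nr_pinned += ret;
			start += (unsigned long)ret << PAGE_SHIFT;
			pages += ret;
			nr_pages -= ret;
		} else if (locked) {
			break;	/* hard failure, not a dropped-lock retry */
		}
	}
	if (locked)
		up_read(&mm->mmap_sem);
	return nr_pinned;
}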