Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  360
1 files changed, 269 insertions, 91 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 02e48aa0ed1..8e8c1832486 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	}
 }
 
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+		pmd_t *pmd, unsigned long address)
 {
 	pgtable_t new = pte_alloc_one(mm, address);
+	int wait_split_huge_page;
 	if (!new)
 		return -ENOMEM;
 
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 
 	spin_lock(&mm->page_table_lock);
-	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
+	wait_split_huge_page = 0;
+	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
 		mm->nr_ptes++;
 		pmd_populate(mm, pmd, new);
 		new = NULL;
-	}
+	} else if (unlikely(pmd_trans_splitting(*pmd)))
+		wait_split_huge_page = 1;
 	spin_unlock(&mm->page_table_lock);
 	if (new)
 		pte_free(mm, new);
+	if (wait_split_huge_page)
+		wait_split_huge_page(vma->anon_vma, pmd);
 	return 0;
 }
 
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
 	smp_wmb(); /* See comment in __pte_alloc */
 
 	spin_lock(&init_mm.page_table_lock);
-	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
+	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
 		pmd_populate_kernel(&init_mm, pmd, new);
 		new = NULL;
-	}
+	} else
+		VM_BUG_ON(pmd_trans_splitting(*pmd));
 	spin_unlock(&init_mm.page_table_lock);
 	if (new)
 		pte_free_kernel(&init_mm, new);
@@ -719,9 +726,9 @@ out_set_pte:
 	return 0;
 }
 
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end)
 {
 	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
 	src_pmd = pmd_offset(src_pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
+		if (pmd_trans_huge(*src_pmd)) {
+			int err;
+			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+			err = copy_huge_pmd(dst_mm, src_mm,
+					    dst_pmd, src_pmd, addr, vma);
+			if (err == -ENOMEM)
+				return -ENOMEM;
+			if (!err)
+				continue;
+			/* fall through */
+		}
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
 		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
+		if (pmd_trans_huge(*pmd)) {
+			if (next-addr != HPAGE_PMD_SIZE) {
+				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+				split_huge_page_pmd(vma->vm_mm, pmd);
+			} else if (zap_huge_pmd(tlb, vma, pmd)) {
+				(*zap_work)--;
+				continue;
+			}
+			/* fall through */
+		}
 		if (pmd_none_or_clear_bad(pmd)) {
 			(*zap_work)--;
 			continue;
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud))
 		goto no_page_table;
-	if (pud_huge(*pud)) {
+	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
 		BUG_ON(flags & FOLL_GET);
 		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
 		goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		goto no_page_table;
-	if (pmd_huge(*pmd)) {
+	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
 		BUG_ON(flags & FOLL_GET);
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
+	if (pmd_trans_huge(*pmd)) {
+		if (flags & FOLL_SPLIT) {
+			split_huge_page_pmd(mm, pmd);
+			goto split_fallthrough;
+		}
+		spin_lock(&mm->page_table_lock);
+		if (likely(pmd_trans_huge(*pmd))) {
+			if (unlikely(pmd_trans_splitting(*pmd))) {
+				spin_unlock(&mm->page_table_lock);
+				wait_split_huge_page(vma->anon_vma, pmd);
+			} else {
+				page = follow_trans_huge_pmd(mm, address,
+							     pmd, flags);
+				spin_unlock(&mm->page_table_lock);
+				goto out;
+			}
+		} else
+			spin_unlock(&mm->page_table_lock);
+		/* fall through */
+	}
+split_fallthrough:
 	if (unlikely(pmd_bad(*pmd)))
 		goto no_page_table;
 
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		 */
 		mark_page_accessed(page);
 	}
+	if (flags & FOLL_MLOCK) {
+		/*
+		 * The preliminary mapping check is mainly to avoid the
+		 * pointless overhead of lock_page on the ZERO_PAGE
+		 * which might bounce very badly if there is contention.
+		 *
+		 * If the page is already locked, we don't need to
+		 * handle it now - vmscan will handle it later if and
+		 * when it attempts to reclaim the page.
+		 */
+		if (page->mapping && trylock_page(page)) {
+			lru_add_drain();	 /* push cached pages to LRU */
+			/*
+			 * Because we lock page here and migration is
+			 * blocked by the pte's page reference, we need
+			 * only check for file-cache page truncation.
+			 */
+			if (page->mapping)
+				mlock_vma_page(page);
+			unlock_page(page);
+		}
+	}
unlock:
 	pte_unmap_unlock(ptep, ptl);
out:
@@ -1341,7 +1412,8 @@ no_page_table:
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int nr_pages, unsigned int gup_flags,
-		     struct page **pages, struct vm_area_struct **vmas)
+		     struct page **pages, struct vm_area_struct **vmas,
+		     int *nonblocking)
 {
 	int i;
 	unsigned long vm_flags;
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			pmd = pmd_offset(pud, pg);
 			if (pmd_none(*pmd))
 				return i ? : -EFAULT;
+			VM_BUG_ON(pmd_trans_huge(*pmd));
 			pte = pte_offset_map(pmd, pg);
 			if (pte_none(*pte)) {
 				pte_unmap(pte);
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		cond_resched();
 		while (!(page = follow_page(vma, start, foll_flags))) {
 			int ret;
+			unsigned int fault_flags = 0;
+
+			if (foll_flags & FOLL_WRITE)
+				fault_flags |= FAULT_FLAG_WRITE;
+			if (nonblocking)
+				fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 
 			ret = handle_mm_fault(mm, vma, start,
-				(foll_flags & FOLL_WRITE) ?
-				FAULT_FLAG_WRITE : 0);
+				fault_flags);
 
 			if (ret & VM_FAULT_ERROR) {
 				if (ret & VM_FAULT_OOM)
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			else
 				tsk->min_flt++;
 
+			if (ret & VM_FAULT_RETRY) {
+				*nonblocking = 0;
+				return i;
+			}
+
 			/*
 			 * The VM_FAULT_WRITE bit tells us that
 			 * do_wp_page has broken COW when necessary,
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	if (force)
 		flags |= FOLL_FORCE;
 
-	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+				NULL);
 }
 EXPORT_SYMBOL(get_user_pages);
 
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr)
 	struct page *page;
 
 	if (__get_user_pages(current, current->mm, addr, 1,
-			FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+			FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+			NULL) < 1)
 		return NULL;
 	flush_cache_page(vma, addr, page_to_pfn(page));
 	return page;
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
 	pud_t * pud = pud_alloc(mm, pgd, addr);
 	if (pud) {
 		pmd_t * pmd = pmd_alloc(mm, pud, addr);
-		if (pmd)
+		if (pmd) {
+			VM_BUG_ON(pmd_trans_huge(*pmd));
 			return pte_alloc_map_lock(mm, pmd, addr, ptl);
+		}
 	}
 	return NULL;
 }
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
+	VM_BUG_ON(pmd_trans_huge(*pmd));
 	do {
 		next = pmd_addr_end(addr, end);
 		if (remap_pte_range(mm, pmd, addr, next,
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
 	return same;
 }
 
-/*
- * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
- * servicing faults for write access.  In the normal case, do always want
- * pte_mkwrite.  But get_user_pages can cause write faults for mappings
- * that do not have writing enabled, when used by access_process_vm.
- */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-{
-	if (likely(vma->vm_flags & VM_WRITE))
-		pte = pte_mkwrite(pte);
-	return pte;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
 	/*
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int reuse = 0, ret = 0;
+	int ret = 0;
 	int page_mkwrite = 0;
 	struct page *dirty_page = NULL;
 
@@ -2144,19 +2219,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 							 &ptl);
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
-				page_cache_release(old_page);
 				goto unlock;
 			}
 			page_cache_release(old_page);
 		}
-		reuse = reuse_swap_page(old_page);
-		if (reuse)
+		if (reuse_swap_page(old_page)) {
 			/*
 			 * The page is all ours.  Move it to our anon_vma so
 			 * the rmap code will not search our parent or siblings.
 			 * Protected against the rmap code by the page lock.
 			 */
 			page_move_anon_rmap(old_page, vma, address);
+			unlock_page(old_page);
+			goto reuse;
+		}
 		unlock_page(old_page);
 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
@@ -2212,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 							  &ptl);
 		if (!pte_same(*page_table, orig_pte)) {
 			unlock_page(old_page);
-			page_cache_release(old_page);
 			goto unlock;
 		}
 
@@ -2220,18 +2295,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 		dirty_page = old_page;
 		get_page(dirty_page);
-		reuse = 1;
-	}
 
-	if (reuse) {
 reuse:
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = pte_mkyoung(orig_pte);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (ptep_set_access_flags(vma, address, page_table, entry,1))
 			update_mmu_cache(vma, address, page_table);
+		pte_unmap_unlock(page_table, ptl);
 		ret |= VM_FAULT_WRITE;
-		goto unlock;
+
+		if (!dirty_page)
+			return ret;
+
+		/*
+		 * Yes, Virginia, this is actually required to prevent a race
+		 * with clear_page_dirty_for_io() from clearing the page dirty
+		 * bit after it clear all dirty ptes, but before a racing
+		 * do_wp_page installs a dirty pte.
+		 *
+		 * do_no_page is protected similarly.
+		 */
+		if (!page_mkwrite) {
+			wait_on_page_locked(dirty_page);
+			set_page_dirty_balance(dirty_page, page_mkwrite);
+		}
+		put_page(dirty_page);
+		if (page_mkwrite) {
+			struct address_space *mapping = dirty_page->mapping;
+
+			set_page_dirty(dirty_page);
+			unlock_page(dirty_page);
+			page_cache_release(dirty_page);
+			if (mapping) {
+				/*
+				 * Some device drivers do not set page.mapping
+				 * but still dirty their pages
+				 */
+				balance_dirty_pages_ratelimited(mapping);
+			}
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file)
+			file_update_time(vma->vm_file);
+
+		return ret;
 	}
 
 	/*
@@ -2256,16 +2365,6 @@ gotten:
 	}
 	__SetPageUptodate(new_page);
 
-	/*
-	 * Don't let another task, with possibly unlocked vma,
-	 * keep the mlocked page.
-	 */
-	if ((vma->vm_flags & VM_LOCKED) && old_page) {
-		lock_page(old_page);	/* for LRU manipulation */
-		clear_page_mlock(old_page);
-		unlock_page(old_page);
-	}
-
 	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
 
@@ -2333,42 +2432,19 @@ gotten:
 
 	if (new_page)
 		page_cache_release(new_page);
-	if (old_page)
-		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
-	if (dirty_page) {
+	if (old_page) {
 		/*
-		 * Yes, Virginia, this is actually required to prevent a race
-		 * with clear_page_dirty_for_io() from clearing the page dirty
-		 * bit after it clear all dirty ptes, but before a racing
-		 * do_wp_page installs a dirty pte.
-		 *
-		 * do_no_page is protected similarly.
+		 * Don't let another task, with possibly unlocked vma,
+		 * keep the mlocked page.
 		 */
-		if (!page_mkwrite) {
-			wait_on_page_locked(dirty_page);
-			set_page_dirty_balance(dirty_page, page_mkwrite);
-		}
-		put_page(dirty_page);
-		if (page_mkwrite) {
-			struct address_space *mapping = dirty_page->mapping;
-
-			set_page_dirty(dirty_page);
-			unlock_page(dirty_page);
-			page_cache_release(dirty_page);
-			if (mapping) {
-				/*
-				 * Some device drivers do not set page.mapping
-				 * but still dirty their pages
-				 */
-				balance_dirty_pages_ratelimited(mapping);
-			}
+		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+			lock_page(old_page);	/* LRU manipulation */
+			munlock_vma_page(old_page);
+			unlock_page(old_page);
 		}
-
-		/* file_update_time outside page_lock */
-		if (vma->vm_file)
-			file_update_time(vma->vm_file);
+		page_cache_release(old_page);
 	}
 	return ret;
 oom_free_new:
@@ -2975,12 +3051,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			goto out;
 		}
 		charged = 1;
-		/*
-		 * Don't let another task, with possibly unlocked vma,
-		 * keep the mlocked page.
-		 */
-		if (vma->vm_flags & VM_LOCKED)
-			clear_page_mlock(vmf.page);
 		copy_user_highpage(page, vmf.page, address, vma);
 		__SetPageUptodate(page);
 	} else {
@@ -3147,9 +3217,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static inline int handle_pte_fault(struct mm_struct *mm,
+int handle_pte_fault(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
 		pte_t *pte, pmd_t *pmd, unsigned int flags)
 {
 	pte_t entry;
 	spinlock_t *ptl;
@@ -3228,9 +3298,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
 		return VM_FAULT_OOM;
-	pte = pte_alloc_map(mm, pmd, address);
-	if (!pte)
+	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+		if (!vma->vm_ops)
+			return do_huge_pmd_anonymous_page(mm, vma, address,
+							  pmd, flags);
+	} else {
+		pmd_t orig_pmd = *pmd;
+		barrier();
+		if (pmd_trans_huge(orig_pmd)) {
+			if (flags & FAULT_FLAG_WRITE &&
+			    !pmd_write(orig_pmd) &&
+			    !pmd_trans_splitting(orig_pmd))
+				return do_huge_pmd_wp_page(mm, vma, address,
+							   pmd, orig_pmd);
+			return 0;
+		}
+	}
+
+	/*
+	 * Use __pte_alloc instead of pte_alloc_map, because we can't
+	 * run pte_offset_map on the pmd, if an huge pmd could
+	 * materialize from under us from a different thread.
+	 */
+	if (unlikely(__pte_alloc(mm, vma, pmd, address)))
 		return VM_FAULT_OOM;
+	/* if an huge pmd materialized from under us just retry later */
+	if (unlikely(pmd_trans_huge(*pmd)))
+		return 0;
+	/*
+	 * A regular pmd is established and it can't morph into a huge pmd
+	 * from under us anymore at this point because we hold the mmap_sem
+	 * read mode and khugepaged takes it in write mode. So now it's
+	 * safe to run pte_offset_map().
+	 */
+	pte = pte_offset_map(pmd, address);
 
 	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }
@@ -3296,7 +3397,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
 	vma = find_vma(current->mm, addr);
 	if (!vma)
 		return -ENOMEM;
-	write = (vma->vm_flags & VM_WRITE) != 0;
+	/*
+	 * We want to touch writable mappings with a write fault in order
+	 * to break COW, except for shared mappings because these don't COW
+	 * and we would not want to dirty them for nothing.
+	 */
+	write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
 	BUG_ON(addr >= end);
 	BUG_ON(end > vma->vm_end);
 	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3368,6 +3474,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
 		goto out;
 
 	pmd = pmd_offset(pud, address);
+	VM_BUG_ON(pmd_trans_huge(*pmd));
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
 		goto out;
 
@@ -3608,3 +3715,74 @@ void might_fault(void)
 }
 EXPORT_SYMBOL(might_fault);
 #endif
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+static void clear_gigantic_page(struct page *page,
+				unsigned long addr,
+				unsigned int pages_per_huge_page)
+{
+	int i;
+	struct page *p = page;
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page;
+	     i++, p = mem_map_next(p, page, i)) {
+		cond_resched();
+		clear_user_highpage(p, addr + i * PAGE_SIZE);
+	}
+}
+void clear_huge_page(struct page *page,
+		     unsigned long addr, unsigned int pages_per_huge_page)
+{
+	int i;
+
+	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+		clear_gigantic_page(page, addr, pages_per_huge_page);
+		return;
+	}
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page; i++) {
+		cond_resched();
+		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+	}
+}
+
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
+				    unsigned long addr,
+				    struct vm_area_struct *vma,
+				    unsigned int pages_per_huge_page)
+{
+	int i;
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+
+	for (i = 0; i < pages_per_huge_page; ) {
+		cond_resched();
+		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
+
+void copy_user_huge_page(struct page *dst, struct page *src,
+			 unsigned long addr, struct vm_area_struct *vma,
+			 unsigned int pages_per_huge_page)
+{
+	int i;
+
+	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+		copy_user_gigantic_page(dst, src, addr, vma,
+					pages_per_huge_page);
+		return;
+	}
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page; i++) {
+		cond_resched();
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+	}
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */